# COVID-19 Case Prediction
This notebook builds GluonTS models to predict COVID-19 cases in 313 different places worldwide.
The dataset is downloaded from Kaggle (https://www.kaggle.com/c/covid19-global-forecasting-week-4). Download it and put all the CSV files in a folder called "covid19-global-forecasting-week-4" in the same directory as this notebook.

In [None]:
%matplotlib inline
import mxnet as mx
from mxnet import gluon
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import json
import os
from tqdm.autonotebook import tqdm
from pathlib import Path

In [None]:
# Forecast horizon in daily steps: the last `prediction_length` observations of
# every series are withheld from training and used as the evaluation window.
prediction_length = 20

## Load data and preprocessing
We first load the data from files. Since the original data doesn't meet the requirements of GluonTS models, we need to preprocess it and generate a new dataframe in which each row represents the time series for a certain place.

In [None]:
# Load the Kaggle train/test CSVs from the local data folder.
# index_col=False keeps the first CSV column as a regular column, not the index.
total = pd.read_csv("./covid19-global-forecasting-week-4/train.csv", index_col=False)
test = pd.read_csv("./covid19-global-forecasting-week-4/test.csv", index_col=False)

In [None]:
# Replace missing region labels with empty strings so the concatenation below
# never yields NaN. Only the two text columns are filled: a blanket fillna("")
# would also turn any missing numeric value into "" and make that column
# object-typed.
key_columns = ["Country_Region", "Province_State"]
total[key_columns] = total[key_columns].fillna("")

# One identifier per time series: "<country>_<province>" (province may be empty).
total["name"] = total["Country_Region"] + "_" + total["Province_State"]
total.head()

In [None]:
# Preview the first rows of the test split.
test.head()

In [None]:
# Sorted series identifiers and dates; `date_list` is reused by later cells.
country_list = sorted(set(total["name"]))
date_list = sorted(set(total["Date"]))

# Reshape long -> wide with a single pivot: one row per place, one column per
# date, each cell holding that place's ConfirmedCases on that date. Pivoting
# once keeps rows and names aligned by construction, instead of relying on
# every per-date slice coming back in the same order as `country_list`.
confirmed_wide = total.pivot(index="name", columns="Date", values="ConfirmedCases")

# Turn the "name" index back into a regular first column and drop the leftover
# "Date" label on the column axis, so the layout is: name, <date 1>, <date 2>, ...
new_df = confirmed_wide.reset_index()
new_df.columns.name = None
new_df.head()

In [None]:
# Wide table of Fatalities: one row per place (sorted by "name"), one column
# per date. A single pivot replaces the per-date filter/pivot loop and keeps
# the rows aligned with the confirmed-cases table, which is sorted the same way.
fatalities_wide = total.pivot(index="name", columns="Date", values="Fatalities")

# Only the values are needed downstream: discard the "name" index and the
# leftover "Date" label on the column axis.
feature_df = fatalities_wide.reset_index(drop=True)
feature_df.columns.name = None
feature_df.head()

## Create training dataset and train the model

In [None]:
from gluonts.dataset.common import load_datasets, ListDataset
from gluonts.dataset.field_names import FieldName


def _as_list_dataset(targets, starts, features):
    """Wrap parallel target/start/feature sequences in a daily ``ListDataset``."""
    entries = []
    for target, start, feature in zip(targets, starts, features):
        entries.append({
            FieldName.TARGET: target,
            FieldName.START: start,
            FieldName.FEAT_DYNAMIC_REAL: feature,
        })
    return ListDataset(entries, freq="D")


# Numeric part of the confirmed-cases table (one row per place).
train_df = new_df.drop(columns=["name"])

# The test split keeps every observation of each series.
test_target_values = train_df.values.copy()
test_feature_values = feature_df.values.copy()

# The training split withholds the final `prediction_length` steps of each series.
train_target_values = [series[:-prediction_length] for series in train_df.values]
train_feature_values = [series[:-prediction_length] for series in feature_df.values]

# Every series shares the same first timestamp.
start_date = [pd.Timestamp("2020-01-22", freq='1D') for _ in range(len(new_df))]

# NOTE(review): the fatalities series is attached as FEAT_DYNAMIC_REAL as a 1-D
# array per place - confirm the estimator is configured to consume it and that
# this is the shape it expects.
train_ds = _as_list_dataset(train_target_values, start_date, train_feature_values)
test_ds = _as_list_dataset(test_target_values, start_date, test_feature_values)

In [None]:
from gluonts.model.deepar import DeepAREstimator
from gluonts.distribution.neg_binomial import NegativeBinomialOutput
from gluonts.trainer import Trainer

n = 50  # number of training epochs

# Likelihood used for the target values.
count_output = NegativeBinomialOutput()

# Optimisation schedule: `n` epochs of 50 batches, 32 series windows per batch.
trainer = Trainer(
    learning_rate=1e-3,
    epochs=n,
    num_batches_per_epoch=50,
    batch_size=32,
)

estimator = DeepAREstimator(
    freq="D",
    prediction_length=prediction_length,
    distr_output=count_output,
    trainer=trainer,
)

# Fit on the truncated series; the result is the predictor used for backtesting.
predictor = estimator.train(train_ds)

## Evaluate the model

In [None]:
from gluonts.evaluation.backtest import make_evaluation_predictions

# Number of series in the test set, used as the progress-bar total below.
num_test_series = len(test_ds)

# Two iterators come back: one over forecasts (100 sample paths requested per
# series) and one over the corresponding series.
forecast_it, ts_it = make_evaluation_predictions(
    dataset=test_ds, predictor=predictor, num_samples=100
)

# Materialise both iterators so they can be reused for evaluation and plotting.
print("Obtaining time series conditioning values ...")
tss = list(tqdm(ts_it, total=num_test_series))
print("Obtaining time series predictions ...")
forecasts = list(tqdm(forecast_it, total=num_test_series))

In [None]:
from gluonts.evaluation import Evaluator


class M5Evaluator(Evaluator):
    """``Evaluator`` that additionally reports an M5-style RMSSE.

    Per series, ``RMSSE`` is the square root of the forecast mean squared error
    over the forecast window divided by the mean squared one-step difference of
    the observations preceding that window. The aggregate ``MRMSSE`` is the
    unweighted mean of the per-series ``RMSSE`` values.
    """

    @staticmethod
    def _rmsse(actual, predicted):
        """Return the RMSSE of ``predicted`` against the tail of ``actual``.

        Args:
            actual: 1-D array holding the full series (history followed by the
                forecast window).
            predicted: 1-D point forecast for the last ``len(predicted)`` steps
                of ``actual``.

        Returns:
            The RMSSE as a Python float, or NaN when the scale is undefined:
            fewer than two history points, or a history whose squared
            differences average to zero (a constant series) or to NaN.
        """
        horizon = len(predicted)
        history = actual[:-horizon] if horizon else actual[:0]
        if len(history) < 2:
            return float("nan")
        # Scale: in-sample mean squared error of the naive one-step forecast.
        scale = np.mean(np.diff(history) ** 2)
        if not scale > 0:  # zero (constant history) or NaN
            return float("nan")
        mse = np.mean((predicted - actual[-horizon:]) ** 2)
        # The root is what makes this RMSSE rather than a scaled MSE.
        return float(np.sqrt(mse / scale))

    def get_metrics_per_ts(self, time_series, forecast):
        """Return the base per-series metrics plus an ``"RMSSE"`` entry.

        The point forecast is the mean over the sample axis of
        ``forecast.samples``; its length defines the forecast window, so the
        metric does not depend on any notebook-level horizon variable.
        """
        actual = time_series.values.reshape(len(time_series))
        point_forecast = forecast.samples.mean(axis=0)
        metrics = super().get_metrics_per_ts(time_series, forecast)
        metrics["RMSSE"] = self._rmsse(actual, point_forecast)
        return metrics

    def get_aggregate_metrics(self, metric_per_ts):
        """Return ``(aggregate_metrics, metric_per_ts)`` with ``"MRMSSE"`` added.

        ``metric_per_ts`` is passed through unchanged as the second element.
        """
        # pandas' mean skips NaN, so series with an undefined RMSSE are ignored.
        mean_rmsse = float(metric_per_ts["RMSSE"].mean())
        agg_metric, _ = super().get_aggregate_metrics(metric_per_ts)
        agg_metric["MRMSSE"] = mean_rmsse
        return agg_metric, metric_per_ts


# Quantile levels passed to the evaluator.
quantile_levels = [0.5, 0.67, 0.95, 0.99]
evaluator = M5Evaluator(quantiles=quantile_levels)

# Fresh iterators are handed over so the materialised lists stay reusable.
agg_metrics, item_metrics = evaluator(
    iter(tss),
    iter(forecasts),
    num_series=len(test_ds),
)
print(json.dumps(agg_metrics, indent=4))

## Plot graphs for the results

In [None]:
# Output folder for saved forecast plots. The trailing slash matters:
# plot_prob_forecasts builds file names by appending to this string.
plot_log_path = "./plots/"
directory = os.path.dirname(plot_log_path)
# exist_ok=True creates the folder if needed without a separate existence
# check, so a concurrent or repeated run cannot fail between check and create.
os.makedirs(directory, exist_ok=True)
    

def plot_prob_forecasts(ts_entry, forecast_entry, path, sample_id, inline=True):
    """Plot one observed series together with its probabilistic forecast.

    A red vertical line marks the observation ``prediction_length`` steps from
    the end of ``ts_entry`` (``prediction_length`` is read from the enclosing
    module scope).

    Args:
        ts_entry: Observed series with a time index; only its last 150 points
            are drawn.
        forecast_entry: Forecast object exposing
            ``plot(prediction_intervals=..., color=...)``.
        path: Prefix for the output file when ``inline`` is false. The file is
            written to ``path + "forecast_<sample_id>.pdf"``, so a directory
            must end with a path separator.
        sample_id: Identifier embedded in the output file name.
        inline: Show the figure when true; otherwise save it as a PDF.
    """
    plot_length = 150
    prediction_intervals = (50, 67, 95, 99)
    legend = ["observations", "median prediction"] + [f"{k}% prediction interval" for k in prediction_intervals][::-1]

    fig, ax = plt.subplots(1, 1, figsize=(10, 7))
    ts_entry[-plot_length:].plot(ax=ax)
    # No axes is passed here; presumably the forecast draws on the current
    # axes, i.e. the one created just above - confirm against the library.
    forecast_entry.plot(prediction_intervals=prediction_intervals, color='g')
    ax.axvline(ts_entry.index[-prediction_length], color='r')
    plt.legend(legend, loc="upper left")
    if inline:
        plt.show()
    else:
        plt.savefig('{}forecast_{}.pdf'.format(path, sample_id))
    # Close this specific figure in both branches. Clearing the "current"
    # figure after show() does not release it and can create a new empty one
    # once the shown figure has already been closed by the backend.
    plt.close(fig)

print("Plotting time series predictions ...")
# Plot at most the first five series. Slicing and zipping (rather than indexing
# with a fixed range) avoids an IndexError when fewer than five are available.
preview_pairs = list(zip(tss[:5], forecasts[:5]))
for i, (ts_entry, forecast_entry) in enumerate(tqdm(preview_pairs)):
    plot_prob_forecasts(ts_entry, forecast_entry, plot_log_path, i)

## Comments
The result is seemingly good, but there is still much room for improvement. The main problem is that the data obtained from Kaggle contains only a few features, which limits us from creating more precise models. The current model is very close to a baseline model because it contains only one extra feature. The next step is to find additional data on Kaggle or elsewhere on the internet to improve the model.