# Comparação entre modelos

## Preparar o ambiente

In [21]:
import numpy as np
import pandas as pd
import plotly.express as px
from sklearn.metrics import mean_squared_error

## Carregar os dados da retroprevisão

Para esta análise, utilizaremos os dados da simulação gerados com o notebook `simulation.ipynb`, bem como dados de casos retirados do [Brasil.IO](https://brasil.io/dataset/covid19/caso_full/) e tratados pelo [CoronaCidades](https://github.com/ImpulsoGov/coronacidades-datasource/).

In [22]:
# load historical predictions
df_predictions = pd.read_csv("../data/br-states-simulacovid-predictions.csv")
df_predictions["date_prediction"] = pd.to_datetime(df_predictions["date_prediction"])

# fill mission values
df_predictions["E0"] = df_predictions["E0"].fillna(0)
df_predictions["E1"] = df_predictions["E1"].fillna(0)
df_predictions["I0"] = df_predictions["I0"].fillna(0)

# sum up infectious subcompartments
df_predictions["I"] = df_predictions["I0"] + df_predictions["I1"] + df_predictions["I2"] + df_predictions["I3"]

df_predictions

Unnamed: 0,days,S,E0,E1,I0,I1,I2,I3,R,D,N,E,scenario,model,date_prediction,state_num_id,I
0,1,1.732049e+06,25912.806364,11160.746304,2217.900000,4992.565145,151.063328,31.471527,705.000000,4.000000,1777225.0,37073.552668,worst,SEAPMDR,2020-04-19,11,7393.000000
1,2,1.721016e+06,28030.924722,13808.962454,3614.379355,8279.039409,157.857568,31.914058,2278.094257,8.262260,1777225.0,41839.887176,worst,SEAPMDR,2020-04-19,11,12083.190390
2,3,1.706539e+06,32478.771646,16316.011132,5142.710324,11868.865484,179.164783,32.690375,4654.920221,12.602302,1777225.0,48794.782778,worst,SEAPMDR,2020-04-19,11,17223.430966
3,4,1.688375e+06,38810.685859,19276.555747,6810.440671,15780.315246,214.091163,34.140381,7906.630186,17.088667,1777225.0,58087.241606,worst,SEAPMDR,2020-04-19,11,22838.987462
4,5,1.666062e+06,46889.011036,22981.920854,8681.686565,20163.552132,262.315575,36.546580,12126.343299,21.831493,1777225.0,69870.931890,worst,SEAPMDR,2020-04-19,11,29144.100853
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2774039,87,2.211629e+06,0.000000,0.000000,0.000000,2270.478843,864.849935,244.496192,780147.817428,10929.996403,3007710.7,1624.007466,best,SEIR,2021-01-19,53,3379.824970
2774040,88,2.211346e+06,0.000000,0.000000,0.000000,2217.484408,844.731456,238.850212,780513.170436,10964.387711,3007710.7,1585.994636,best,SEIR,2021-01-19,53,3301.066076
2774041,89,2.211070e+06,0.000000,0.000000,0.000000,2165.706149,825.071624,233.330554,780870.002262,10997.984553,3007710.7,1548.859116,best,SEIR,2021-01-19,53,3224.108327
2774042,90,2.210800e+06,0.000000,0.000000,0.000000,2115.117114,805.860408,227.934625,781218.508181,11030.804718,3007710.7,1512.581241,best,SEIR,2021-01-19,53,3148.912147


In [23]:
# load cases
cases_url = "http://datasource.coronacidades.org/br/states/cases/full"
df_cases = pd.read_csv(cases_url)
df_cases["last_updated"] = pd.to_datetime(df_cases["last_updated"])

df_cases

Unnamed: 0,active_cases,confirmed_cases,daily_cases,daily_cases_diff_14_days,daily_cases_growth,daily_cases_mavg,daily_cases_mavg_100k,deaths,epidemiological_week,estimated_cases,...,new_deaths_growth,new_deaths_mavg,new_deaths_mavg_100k,notification_rate,population,state_id,state_name,state_num_id,total_estimated_cases,data_last_refreshed
0,,1,1,,estabilizando,,,0,12,,...,estabilizando,,,,1777225,RO,Rondônia,11,,2021-01-31 04:27:30
1,,3,2,,estabilizando,,,0,12,,...,estabilizando,,,,1777225,RO,Rondônia,11,,2021-01-31 04:27:30
2,,3,0,,estabilizando,,,0,13,,...,estabilizando,,,,1777225,RO,Rondônia,11,,2021-01-31 04:27:30
3,,3,0,,estabilizando,,,0,13,,...,estabilizando,,,,1777225,RO,Rondônia,11,,2021-01-31 04:27:30
4,,5,2,,estabilizando,,,0,13,,...,estabilizando,,,,1777225,RO,Rondônia,11,,2021-01-31 04:27:30
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8743,,272375,1056,6.0,crescendo,927.0,30.743536,4505,4,,...,estabilizando,9.9,0.328329,,3015268,DF,Distrito Federal,53,,2021-01-31 04:27:30
8744,,273427,1052,6.0,crescendo,988.7,32.789789,4508,4,,...,estabilizando,9.4,0.311747,,3015268,DF,Distrito Federal,53,,2021-01-31 04:27:30
8745,,274601,1174,8.0,crescendo,1037.3,34.401586,4519,4,,...,estabilizando,9.6,0.318380,,3015268,DF,Distrito Federal,53,,2021-01-31 04:27:30
8746,,275688,1087,8.0,crescendo,1042.0,34.557459,4533,4,,...,estabilizando,10.4,0.344911,,3015268,DF,Distrito Federal,53,,2021-01-31 04:27:30


In [24]:
df_cases.dtypes

active_cases                       float64
confirmed_cases                      int64
daily_cases                          int64
daily_cases_diff_14_days           float64
daily_cases_growth                  object
daily_cases_mavg                   float64
daily_cases_mavg_100k              float64
deaths                               int64
epidemiological_week                 int64
estimated_cases                    float64
expected_mortality                 float64
infectious_period_cases            float64
is_last                               bool
is_repeated                           bool
last_updated                datetime64[ns]
new_deaths                           int64
new_deaths_diff_14_days            float64
new_deaths_growth                   object
new_deaths_mavg                    float64
new_deaths_mavg_100k               float64
notification_rate                  float64
population                           int64
state_id                            object
state_name 

In [25]:
# merge DataFrame with predictions with DataFrame with cases in the correspondent date

df_predictions["reference_date"]=(
    df_predictions.apply(lambda row: row["date_prediction"] + pd.Timedelta(days=row["days"]), axis=1)
)

df_predictions = df_predictions.merge(
    df_cases, how="left", left_on=["state_num_id", "reference_date"], right_on=["state_num_id", "last_updated"]
)

df_predictions

Unnamed: 0,days,S,E0,E1,I0,I1,I2,I3,R,D,...,new_deaths_diff_14_days,new_deaths_growth,new_deaths_mavg,new_deaths_mavg_100k,notification_rate,population,state_id,state_name,total_estimated_cases,data_last_refreshed
0,1,1.732049e+06,25912.806364,11160.746304,2217.900000,4992.565145,151.063328,31.471527,705.000000,4.000000,...,2.0,estabilizando,0.3,0.016880,0.024231,1777225.0,RO,Rondônia,4368.0,2021-01-31 04:27:30
1,2,1.721016e+06,28030.924722,13808.962454,3614.379355,8279.039409,157.857568,31.914058,2278.094257,8.262260,...,3.0,estabilizando,0.4,0.022507,0.026181,1777225.0,RO,Rondônia,4890.0,2021-01-31 04:27:30
2,3,1.706539e+06,32478.771646,16316.011132,5142.710324,11868.865484,179.164783,32.690375,4654.920221,12.602302,...,3.0,estabilizando,0.4,0.022507,0.024691,1777225.0,RO,Rondônia,5430.0,2021-01-31 04:27:30
3,4,1.688375e+06,38810.685859,19276.555747,6810.440671,15780.315246,214.091163,34.140381,7906.630186,17.088667,...,1.0,estabilizando,0.3,0.016880,0.025440,1777225.0,RO,Rondônia,6017.0,2021-01-31 04:27:30
4,5,1.666062e+06,46889.011036,22981.920854,8681.686565,20163.552132,262.315575,36.546580,12126.343299,21.831493,...,2.0,estabilizando,0.4,0.022507,0.030751,1777225.0,RO,Rondônia,6585.0,2021-01-31 04:27:30
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2774039,87,2.211629e+06,0.000000,0.000000,0.000000,2270.478843,864.849935,244.496192,780147.817428,10929.996403,...,,,,,,,,,,
2774040,88,2.211346e+06,0.000000,0.000000,0.000000,2217.484408,844.731456,238.850212,780513.170436,10964.387711,...,,,,,,,,,,
2774041,89,2.211070e+06,0.000000,0.000000,0.000000,2165.706149,825.071624,233.330554,780870.002262,10997.984553,...,,,,,,,,,,
2774042,90,2.210800e+06,0.000000,0.000000,0.000000,2115.117114,805.860408,227.934625,781218.508181,11030.804718,...,,,,,,,,,,


## Comparar previsto com efetivo

Utilizaremos as retroprojeções para estimar a qualidade de ambos os modelos em análise.

In [26]:
# filter dataset
df_predictions["I"] = df_predictions["I0"].fillna(0) + df_predictions["I1"] + df_predictions["I2"] + df_predictions["I3"]
predicted_vs_seen = df_predictions[["model", "days", "scenario", "reference_date", "state_num_id", "I", "active_cases", "population"]].dropna()
predicted_vs_seen["predicted_percapita"] = predicted_vs_seen["I"] / predicted_vs_seen["population"]
predicted_vs_seen["infected_percapita"] = predicted_vs_seen["active_cases"] / predicted_vs_seen["population"]
predicted_vs_seen.drop(columns=["I", "active_cases", "population"])

predicted_vs_seen

Unnamed: 0,model,days,scenario,reference_date,state_num_id,I,active_cases,population,predicted_percapita,infected_percapita
0,SEAPMDR,1,worst,2020-04-20,11,7393.000000,7470.0,1777225.0,0.004160,0.004203
1,SEAPMDR,2,worst,2020-04-21,11,12083.190390,7639.0,1777225.0,0.006799,0.004298
2,SEAPMDR,3,worst,2020-04-22,11,17223.430966,7978.0,1777225.0,0.009691,0.004489
3,SEAPMDR,4,worst,2020-04-23,11,22838.987462,8726.0,1777225.0,0.012851,0.004910
4,SEAPMDR,5,worst,2020-04-24,11,29144.100853,8390.0,1777225.0,0.016399,0.004721
...,...,...,...,...,...,...,...,...,...,...
2770314,SEIR,2,best,2021-01-11,53,18087.710154,23427.0,3015268.0,0.005999,0.007769
2770404,SEAPMDR,1,worst,2021-01-11,53,25191.000000,23427.0,3015268.0,0.008354,0.007769
2770495,SEAPMDR,1,best,2021-01-11,53,25191.000000,23427.0,3015268.0,0.008354,0.007769
2770586,SEIR,1,worst,2021-01-11,53,17633.700000,23427.0,3015268.0,0.005848,0.007769


Para avaliar a qualidade dos modelos, utilizamos a [raíz do erro quadrático médio (RMSE)](https://en.wikipedia.org/wiki/Root-mean-square_deviation), que expressa os resíduos não explicados pelo modelo, na mesma escala original (número de infectados per capita) - ou seja, quanto menor, melhor.

In [27]:
def rmse( g ):
    """Generates RMSE values for a pd.DataFrame."""
    # CREDIT: https://stackoverflow.com/a/47914634
    # TODO: abstract column names
    rmse = np.sqrt(mean_squared_error( g['infected_percapita'], g['predicted_percapita'] ) )
    return pd.Series(rmse)

performance = (
    predicted_vs_seen
    .groupby(["model", "days", "scenario"])
    .apply(rmse)
    .rename(columns={0:"rmse"})
    .reset_index()
)

performance

Unnamed: 0,model,days,scenario,rmse
0,SEAPMDR,1,best,0.001752
1,SEAPMDR,1,worst,0.001752
2,SEAPMDR,2,best,0.002074
3,SEAPMDR,2,worst,0.002111
4,SEAPMDR,3,best,0.002452
...,...,...,...,...
359,SEIR,89,worst,0.032175
360,SEIR,90,best,0.025516
361,SEIR,90,worst,0.031671
362,SEIR,91,best,0.025231


## Visualização dos resultados 

### Análise de resíduos

O gráfico a seguir fornece uma visualização de como os resíduos de ambos os modelos se comportam com o aumento do número de dias da previsão. 

Conforme se pode pode observar na figura, ambos os modelos tem resultados bastante parecidos nos primeiros dias. Porém, o modelo original (SEIR) tende a manter resíduos significativamente menores do que o modelo modificado (SEAPMDR) a partir do 18º dia, quando o resíduo desse segundo modelo aumenta em mais de uma ordem de grandeza.

In [34]:
performance_best = performance.query("scenario=='worst' & days < 91 & days % 2")

fig = px.line(
    performance_best,
    x="days",
    y="rmse",
    color="model",
    title="Raíz do Erro Quadrático Médio para modelos SEIR e SEAPMDR",
    labels={
        "model": "Modelo",
        "days": "dias",
        "rmse": "RMSE",
    }
)

fig.show()

### Curva de casos ativos

In [29]:
active_cases = (
    df_predictions
    .loc[df_predictions["reference_date"]<pd.Timestamp.now()-pd.Timedelta(days=91),:]
    .query("days % 15 == 0")
    .groupby(["model", "days", "scenario", "reference_date"])
    .sum()[["I", "active_cases"]]
    .reset_index()
    .rename(columns={"I": "predicted", "active_cases": "verified"})
    .melt(id_vars=["model", "days", "scenario", "reference_date"], var_name="value_type", value_name="cases")
)

active_cases = active_cases.loc[active_cases["days"]<=90,:]
active_cases

Unnamed: 0,model,days,scenario,reference_date,value_type,cases
0,SEAPMDR,15,best,2020-04-12,predicted,474195.237115
1,SEAPMDR,15,best,2020-04-13,predicted,483841.471739
2,SEAPMDR,15,best,2020-04-14,predicted,459198.735179
3,SEAPMDR,15,best,2020-04-15,predicted,407895.220182
4,SEAPMDR,15,best,2020-04-16,predicted,420525.841853
...,...,...,...,...,...,...
7987,SEIR,90,worst,2020-10-28,verified,828459.000000
7988,SEIR,90,worst,2020-10-29,verified,888265.000000
7989,SEIR,90,worst,2020-10-30,verified,939371.000000
7990,SEIR,90,worst,2020-10-31,verified,972146.000000


In [30]:
fig = px.line(
    active_cases.query("scenario=='best' & model=='SEAPMDR'"),
    x="reference_date",
    y="cases",
    color="value_type",
    title="Casos preditos versus observados",
    facet_col="days",
)

fig.show()