# Comparação entre modelos

## Preparar o ambiente

In [3]:
import numpy as np
import pandas as pd
import plotly.express as px
from sklearn.metrics import mean_squared_error

## Carregar os dados da retroprevisão

Para esta análise, utilizaremos os dados da simulação gerados com o notebook `simulation.ipynb`, bem como dados de casos retirados do [Brasil.IO](https://brasil.io/dataset/covid19/caso_full/) e tratados pelo [CoronaCidades](https://github.com/ImpulsoGov/coronacidades-datasource/).

In [4]:
# load historical predictions
df_predictions = pd.read_csv("../data/br-states-simulacovid-predictions.csv")
df_predictions["date_prediction"] = pd.to_datetime(df_predictions["date_prediction"])

df_predictions

Unnamed: 0,days,S,E0,E1,I0,I1,I2,I3,R,D,N,E,scenario,model,date_prediction,state_num_id
0,1,4.565597e+07,1.000644e+05,6.670963e+04,2.755470e+04,6.140671e+04,2389.728066,497.860014,4.367000e+03,84.000000,4.591905e+07,1.667741e+05,worst,SEAPMDR,2020-03-28,35
1,2,1.396044e-09,3.278580e+07,1.010676e+07,8.520469e+05,1.985665e+06,6559.756994,534.861761,1.815116e+05,170.119344,4.591905e+07,4.289256e+07,worst,SEAPMDR,2020-03-28,35
2,3,-1.502009e-10,2.349205e+07,1.334298e+07,2.401212e+06,5.600759e+06,27414.526944,893.371300,1.053456e+06,283.465823,4.591905e+07,3.683503e+07,worst,SEAPMDR,2020-03-28,35
3,4,-6.815688e-14,1.683279e+07,1.326121e+07,3.895125e+06,9.086873e+06,66014.250543,2026.532316,2.774495e+06,517.519017,4.591905e+07,3.009400e+07,worst,SEAPMDR,2020-03-28,35
4,5,3.954639e-16,1.206122e+07,1.174656e+07,5.031299e+06,1.173822e+07,116438.288104,4248.530970,5.220026e+06,1033.420015,4.591905e+07,2.380779e+07,worst,SEAPMDR,2020-03-28,35
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2774039,87,2.211629e+06,,,,2.270479e+03,864.849935,244.496192,7.801478e+05,10929.996403,3.007711e+06,1.624007e+03,best,SEIR,2021-01-19,53
2774040,88,2.211346e+06,,,,2.217484e+03,844.731456,238.850212,7.805132e+05,10964.387711,3.007711e+06,1.585995e+03,best,SEIR,2021-01-19,53
2774041,89,2.211070e+06,,,,2.165706e+03,825.071624,233.330554,7.808700e+05,10997.984553,3.007711e+06,1.548859e+03,best,SEIR,2021-01-19,53
2774042,90,2.210800e+06,,,,2.115117e+03,805.860408,227.934625,7.812185e+05,11030.804718,3.007711e+06,1.512581e+03,best,SEIR,2021-01-19,53


In [13]:
# load cases
cases_url = "http://datasource.coronacidades.org/br/states/cases/full"
df_cases = pd.read_csv(cases_url)
df_cases["last_updated"] = pd.to_datetime(df_cases["last_updated"])

df_cases

Unnamed: 0,active_cases,confirmed_cases,daily_cases,daily_cases_diff_14_days,daily_cases_growth,daily_cases_mavg,daily_cases_mavg_100k,deaths,epidemiological_week,estimated_cases,...,new_deaths_growth,new_deaths_mavg,new_deaths_mavg_100k,notification_rate,population,state_id,state_name,state_num_id,total_estimated_cases,data_last_refreshed
0,,1,1,,estabilizando,,,0,12,,...,estabilizando,,,,1777225,RO,Rondônia,11,,2021-01-30 01:27:57
1,,3,2,,estabilizando,,,0,12,,...,estabilizando,,,,1777225,RO,Rondônia,11,,2021-01-30 01:27:57
2,,3,0,,estabilizando,,,0,13,,...,estabilizando,,,,1777225,RO,Rondônia,11,,2021-01-30 01:27:57
3,,3,0,,estabilizando,,,0,13,,...,estabilizando,,,,1777225,RO,Rondônia,11,,2021-01-30 01:27:57
4,,5,2,,estabilizando,,,0,13,,...,estabilizando,,,,1777225,RO,Rondônia,11,,2021-01-30 01:27:57
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8717,,271319,1148,4.0,estabilizando,863.6,28.640904,4490,4,,...,estabilizando,9.0,0.298481,,3015268,DF,Distrito Federal,53,,2021-01-30 01:27:57
8718,,272375,1056,6.0,crescendo,927.0,30.743536,4505,4,,...,estabilizando,9.9,0.328329,,3015268,DF,Distrito Federal,53,,2021-01-30 01:27:57
8719,,273427,1052,6.0,crescendo,988.7,32.789789,4508,4,,...,estabilizando,9.4,0.311747,,3015268,DF,Distrito Federal,53,,2021-01-30 01:27:57
8720,,274601,1174,8.0,crescendo,1037.3,34.401586,4519,4,,...,estabilizando,9.6,0.318380,,3015268,DF,Distrito Federal,53,,2021-01-30 01:27:57


In [23]:
df_cases.dtypes

active_cases                       float64
confirmed_cases                      int64
daily_cases                          int64
daily_cases_diff_14_days           float64
daily_cases_growth                  object
daily_cases_mavg                   float64
daily_cases_mavg_100k              float64
deaths                               int64
epidemiological_week                 int64
estimated_cases                    float64
expected_mortality                 float64
infectious_period_cases            float64
is_last                               bool
is_repeated                           bool
last_updated                datetime64[ns]
new_deaths                           int64
new_deaths_diff_14_days            float64
new_deaths_growth                   object
new_deaths_mavg                    float64
new_deaths_mavg_100k               float64
notification_rate                  float64
population                           int64
state_id                            object
state_name 

In [24]:
# merge DataFrame with predictions with DataFrame with cases in the correspondent date

df_predictions["reference_date"]=(
    df_predictions.apply(lambda row: row["date_prediction"] + pd.Timedelta(days=row["days"]), axis=1)
)

df_predictions = df_predictions.merge(
    df_cases, how="left", left_on=["state_num_id", "reference_date"], right_on=["state_num_id", "last_updated"]
)

df_predictions

Unnamed: 0,days,S,E0,E1,I0,I1,I2,I3,R,D,...,new_deaths_diff_14_days,new_deaths_growth,new_deaths_mavg,new_deaths_mavg_100k,notification_rate,population,state_id,state_name,total_estimated_cases,data_last_refreshed
0,1,4.565597e+07,1.000644e+05,6.670963e+04,2.755470e+04,6.140671e+04,2389.728066,497.860014,4.367000e+03,84.000000,...,13.0,crescendo,10.9,0.023737,0.013834,45919049.0,SP,São Paulo,67933.0,2021-01-30 01:27:57
1,2,1.396044e-09,3.278580e+07,1.010676e+07,8.520469e+05,1.985665e+06,6559.756994,534.861761,1.815116e+05,170.119344,...,14.0,crescendo,11.9,0.025915,0.013878,45919049.0,SP,São Paulo,74908.0,2021-01-30 01:27:57
2,3,-1.502009e-10,2.349205e+07,1.334298e+07,2.401212e+06,5.600759e+06,27414.526944,893.371300,1.053456e+06,283.465823,...,14.0,crescendo,13.7,0.029835,0.021454,45919049.0,SP,São Paulo,81704.0,2021-01-30 01:27:57
3,4,-6.815688e-14,1.683279e+07,1.326121e+07,3.895125e+06,9.086873e+06,66014.250543,2026.532316,2.774495e+06,517.519017,...,14.0,crescendo,16.6,0.036151,0.027281,45919049.0,SP,São Paulo,88588.0,2021-01-30 01:27:57
4,5,3.954639e-16,1.206122e+07,1.174656e+07,5.031299e+06,1.173822e+07,116438.288104,4248.530970,5.220026e+06,1033.420015,...,14.0,crescendo,18.6,0.040506,0.030533,45919049.0,SP,São Paulo,95719.0,2021-01-30 01:27:57
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2774039,87,2.211629e+06,,,,2.270479e+03,864.849935,244.496192,7.801478e+05,10929.996403,...,,,,,,,,,,
2774040,88,2.211346e+06,,,,2.217484e+03,844.731456,238.850212,7.805132e+05,10964.387711,...,,,,,,,,,,
2774041,89,2.211070e+06,,,,2.165706e+03,825.071624,233.330554,7.808700e+05,10997.984553,...,,,,,,,,,,
2774042,90,2.210800e+06,,,,2.115117e+03,805.860408,227.934625,7.812185e+05,11030.804718,...,,,,,,,,,,


## Comparar previsto com efetivo

Utilizaremos as retroprojeções para estimar a qualidade de ambos os modelos em análise.

In [25]:
# filter dataset
df_predictions["I"] = df_predictions["I0"].fillna(0) + df_predictions["I1"] + df_predictions["I2"] + df_predictions["I3"]
predicted_vs_seen = df_predictions[["model", "days", "scenario", "reference_date", "state_num_id", "I", "estimated_cases", "population"]].dropna()
predicted_vs_seen["predicted_ppc"] = predicted_vs_seen["I"] / predicted_vs_seen["population"]
predicted_vs_seen["infected_ppc"] = predicted_vs_seen["estimated_cases"] / predicted_vs_seen["population"]
predicted_vs_seen.drop(columns=["I", "estimated_cases", "population"])

predicted_vs_seen

Unnamed: 0,model,days,scenario,reference_date,state_num_id,I,estimated_cases,population,predicted_ppc,infected_ppc
0,SEAPMDR,1,worst,2020-03-29,35,9.184900e+04,6679.0,45919049.0,0.002000,0.000145
1,SEAPMDR,2,worst,2020-03-30,35,2.844807e+06,6975.0,45919049.0,0.061953,0.000152
2,SEAPMDR,3,worst,2020-03-31,35,8.030278e+06,6796.0,45919049.0,0.174879,0.000148
3,SEAPMDR,4,worst,2020-04-01,35,1.305004e+07,6884.0,45919049.0,0.284197,0.000150
4,SEAPMDR,5,worst,2020-04-02,35,1.689020e+07,7131.0,45919049.0,0.367826,0.000155
...,...,...,...,...,...,...,...,...,...,...
2675309,SEIR,1,best,2021-01-10,52,2.685690e+04,4789.0,7018354.0,0.003827,0.000682
2675400,SEAPMDR,1,worst,2021-01-10,53,2.519100e+04,1679.0,3015268.0,0.008354,0.000557
2675491,SEAPMDR,1,best,2021-01-10,53,2.519100e+04,1679.0,3015268.0,0.008354,0.000557
2675582,SEIR,1,worst,2021-01-10,53,1.763370e+04,1679.0,3015268.0,0.005848,0.000557


Para avaliar a qualidade dos modelos, utilizamos a [raíz do erro quadrático médio (RMSE)](https://en.wikipedia.org/wiki/Root-mean-square_deviation), que expressa os resíduos não explicados pelo modelo, na mesma escala original (número de infectados per capita) - ou seja, quanto menor, melhor.

In [26]:
def rmse( g ):
    """Generates RMSE values for a pd.DataFrame."""
    # CREDIT: https://stackoverflow.com/a/47914634
    # TODO: abstract column names
    rmse = np.sqrt(mean_squared_error( g['infected_ppc'], g['predicted_ppc'] ) )
    return pd.Series(rmse)

performance = (
    predicted_vs_seen
    .groupby(["model", "days", "scenario"])
    .apply(rmse)
    .rename(columns={0:"rmse"})
    .reset_index()
)

performance

Unnamed: 0,model,days,scenario,rmse
0,SEAPMDR,1,best,0.012188
1,SEAPMDR,1,worst,0.012188
2,SEAPMDR,2,best,0.087816
3,SEAPMDR,2,worst,0.084536
4,SEAPMDR,3,best,0.193574
...,...,...,...,...
359,SEIR,89,worst,0.037086
360,SEIR,90,best,0.030361
361,SEIR,90,worst,0.036518
362,SEIR,91,best,0.030026


### Visualização dos resultados 

O gráfico a seguir fornece uma visualização de como os resíduos de ambos os modelos se comportam com o aumento do número de dias da previsão. 

Conforme se pode pode observar na figura, ambos os modelos tem resultados bastante parecidos nos primeiros dias. Porém, o modelo original (SEIR) tende a manter resíduos significativamente menores do que o modelo modificado (SEAPMDR) a partir do 18º dia, quando o resíduo desse segundo modelo aumenta em mais de uma ordem de grandeza.

In [30]:
performance_best = performance.loc[performance.scenario=="worst",:]
performance_best = performance_best.loc[performance.days<np.int64(90),:]

fig = px.line(
    performance_best,
    x="days",
    y="rmse",
    color="model",
    title="Raíz do Erro Quadrático Médio para modelos SEIR e SEAPMDR",
    labels={
        "model": "Modelo",
        "days": "dias",
        "rmse": "RMSE",
    }
)

fig.show()