# Comparação entre modelos

## Preparar o ambiente

In [110]:
import numpy as np
import pandas as pd
import plotly.express as px
from sklearn.metrics import mean_squared_error

## Carregar os dados da retroprevisão

Para esta análise, utilizaremos os dados da simulação gerados com o notebook `simulation.ipynb`, bem como dados de casos retirados do [Brasil.IO](https://brasil.io/dataset/covid19/caso_full/) e tratados pelo [CoronaCidades](https://github.com/ImpulsoGov/coronacidades-datasource/).

In [77]:
# load historical predictions
df_predictions = pd.read_csv("../data/br-states-simulacovid-predictions.csv")
df_predictions["date_prediction"] = pd.to_datetime(df_predictions["date_prediction"])

df_predictions

Unnamed: 0,days,S,E0,E1,I0,I1,I2,I3,R,D,N,E,scenario,model,date_prediction,state_num_id
0,1,4.645945e+07,-382022.686501,-254681.791001,27554.700000,125319.820244,4876.996054,1016.040845,4367.000000,84.000000,4.598597e+07,-636704.477502,worst,SEAPMDR,2020-03-28,35
1,2,4.656272e+07,-362716.788717,-250731.435238,-11630.314559,24519.586999,100.522754,946.995765,22589.897472,166.509350,4.598597e+07,-613448.223955,worst,SEAPMDR,2020-03-28,35
2,3,4.671211e+07,-387975.225819,-250040.202112,-44335.782829,-59723.463938,1201.845839,842.550316,13645.365421,240.606290,4.598597e+07,-638015.427931,worst,SEAPMDR,2020-03-28,35
3,4,4.690574e+07,-443727.437406,-260767.904762,-72685.526178,-132585.712030,6492.176962,841.003690,-17649.622955,309.553412,4.598597e+07,-704495.342168,worst,SEAPMDR,2020-03-28,35
4,5,4.714730e+07,-524572.904158,-285516.434544,-99145.679195,-200008.371331,14863.038514,1026.907170,-68363.234751,385.717610,4.598597e+07,-810089.338703,worst,SEAPMDR,2020-03-28,35
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2774039,87,2.128117e+06,,,,3869.528170,-1713.816479,-497.028746,895728.748764,-2154.829658,3.026064e+06,2714.390975,best,SEIR,2021-01-19,53
2774040,88,2.127652e+06,,,,3768.787879,-1669.434780,-484.305840,896342.348092,-2189.043296,3.026064e+06,2643.416523,best,SEIR,2021-01-19,53
2774041,89,2.127200e+06,,,,3670.611200,-1626.170497,-471.894533,896939.924598,-2222.380639,3.026064e+06,2574.264106,best,SEIR,2021-01-19,53
2774042,90,2.126759e+06,,,,3574.935954,-1583.997127,-459.788030,897521.888542,-2254.863176,3.026064e+06,2506.888712,best,SEIR,2021-01-19,53


In [79]:
# load cases
cases_url = "http://datasource.coronacidades.org/br/states/cases/full"
df_cases = pd.read_csv(cases_url)
df_cases["last_updated"] = pd.to_datetime(df_cases["last_updated"])

df_cases

Unnamed: 0,active_cases,confirmed_cases,daily_cases,daily_cases_diff_14_days,daily_cases_growth,daily_cases_mavg,daily_cases_mavg_100k,deaths,epidemiological_week,estimated_cases,...,new_deaths_growth,new_deaths_mavg,new_deaths_mavg_100k,notification_rate,population,state_id,state_name,state_num_id,total_estimated_cases,data_last_refreshed
0,,1,1,,estabilizando,,,0,12,,...,estabilizando,,,,1777225,RO,Rondônia,11,,2021-01-21 00:25:54
1,,3,2,,estabilizando,,,0,12,,...,estabilizando,,,,1777225,RO,Rondônia,11,,2021-01-21 00:25:54
2,,3,0,,estabilizando,,,0,13,,...,estabilizando,,,,1777225,RO,Rondônia,11,,2021-01-21 00:25:54
3,,3,0,,estabilizando,,,0,13,,...,estabilizando,,,,1777225,RO,Rondônia,11,,2021-01-21 00:25:54
4,,5,2,,estabilizando,,,0,13,,...,estabilizando,,,,1777225,RO,Rondônia,11,,2021-01-21 00:25:54
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8474,,263756,828,4.0,estabilizando,800.4,26.544904,4412,2,,...,estabilizando,8.7,0.288532,,3015268,DF,Distrito Federal,53,,2021-01-21 00:25:54
8475,,264227,471,4.0,estabilizando,773.7,25.659411,4421,3,,...,estabilizando,9.3,0.308430,,3015268,DF,Distrito Federal,53,,2021-01-21 00:25:54
8476,,265274,1047,6.0,crescendo,782.4,25.947942,4427,3,,...,estabilizando,8.4,0.278582,,3015268,DF,Distrito Federal,53,,2021-01-21 00:25:54
8477,,265886,612,4.0,estabilizando,727.0,24.110626,4436,3,,...,estabilizando,8.7,0.288532,,3015268,DF,Distrito Federal,53,,2021-01-21 00:25:54


In [80]:
df_cases.dtypes

active_cases                       float64
confirmed_cases                      int64
daily_cases                          int64
daily_cases_diff_14_days           float64
daily_cases_growth                  object
daily_cases_mavg                   float64
daily_cases_mavg_100k              float64
deaths                               int64
epidemiological_week                 int64
estimated_cases                    float64
expected_mortality                 float64
infectious_period_cases            float64
is_last                               bool
is_repeated                           bool
last_updated                datetime64[ns]
new_deaths                           int64
new_deaths_diff_14_days            float64
new_deaths_growth                   object
new_deaths_mavg                    float64
new_deaths_mavg_100k               float64
notification_rate                  float64
population                           int64
state_id                            object
state_name 

In [81]:
# merge DataFrame with predictions with DataFrame with cases in the correspondent date

df_predictions["reference_date"]=(
    df_predictions.apply(lambda row: row["date_prediction"] + pd.Timedelta(days=row["days"]), axis=1)
)

df_predictions = df_predictions.merge(
    df_cases, how="left", left_on=["state_num_id", "reference_date"], right_on=["state_num_id", "last_updated"]
)

df_predictions

Unnamed: 0,days,S,E0,E1,I0,I1,I2,I3,R,D,...,new_deaths_diff_14_days,new_deaths_growth,new_deaths_mavg,new_deaths_mavg_100k,notification_rate,population,state_id,state_name,total_estimated_cases,data_last_refreshed
0,1,4.645945e+07,-382022.686501,-254681.791001,27554.700000,125319.820244,4876.996054,1016.040845,4367.000000,84.000000,...,13.0,crescendo,10.9,0.023737,0.013845,45919049.0,SP,São Paulo,67892.0,2021-01-21 00:25:54
1,2,4.656272e+07,-362716.788717,-250731.435238,-11630.314559,24519.586999,100.522754,946.995765,22589.897472,166.509350,...,14.0,crescendo,11.9,0.025915,0.013858,45919049.0,SP,São Paulo,74877.0,2021-01-21 00:25:54
2,3,4.671211e+07,-387975.225819,-250040.202112,-44335.782829,-59723.463938,1201.845839,842.550316,13645.365421,240.606290,...,14.0,crescendo,13.7,0.029835,0.021378,45919049.0,SP,São Paulo,81697.0,2021-01-21 00:25:54
3,4,4.690574e+07,-443727.437406,-260767.904762,-72685.526178,-132585.712030,6492.176962,841.003690,-17649.622955,309.553412,...,14.0,crescendo,16.6,0.036151,0.027293,45919049.0,SP,São Paulo,88578.0,2021-01-21 00:25:54
4,5,4.714730e+07,-524572.904158,-285516.434544,-99145.679195,-200008.371331,14863.038514,1026.907170,-68363.234751,385.717610,...,14.0,crescendo,18.6,0.040506,0.030636,45919049.0,SP,São Paulo,95685.0,2021-01-21 00:25:54
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2774039,87,2.128117e+06,,,,3869.528170,-1713.816479,-497.028746,895728.748764,-2154.829658,...,,,,,,,,,,
2774040,88,2.127652e+06,,,,3768.787879,-1669.434780,-484.305840,896342.348092,-2189.043296,...,,,,,,,,,,
2774041,89,2.127200e+06,,,,3670.611200,-1626.170497,-471.894533,896939.924598,-2222.380639,...,,,,,,,,,,
2774042,90,2.126759e+06,,,,3574.935954,-1583.997127,-459.788030,897521.888542,-2254.863176,...,,,,,,,,,,


## Comparar previsto com efetivo

Utilizaremos as retroprojeções para estimar a qualidade de ambos os modelos em análise.

In [95]:
# filter dataset
df_predictions["I"] = df_predictions["I0"].fillna(0) + df_predictions["I1"] + df_predictions["I2"] + df_predictions["I3"]
predicted_vs_seen = df_predictions[["model", "days", "scenario", "reference_date", "state_num_id", "I", "estimated_cases", "population"]].dropna()
predicted_vs_seen["predicted_ppc"] = predicted_vs_seen["I"] / predicted_vs_seen["population"]
predicted_vs_seen["infected_ppc"] = predicted_vs_seen["estimated_cases"] / predicted_vs_seen["population"]
predicted_vs_seen.drop(columns=["I", "estimated_cases", "population"])

predicted_vs_seen

Unnamed: 0,model,days,scenario,reference_date,state_num_id,I,estimated_cases,population,predicted_ppc,infected_ppc
0,SEAPMDR,1,worst,2020-03-29,35,158767.557143,6674.0,45919049.0,0.003458,0.000145
1,SEAPMDR,2,worst,2020-03-30,35,13936.790959,6985.0,45919049.0,0.000304,0.000152
2,SEAPMDR,3,worst,2020-03-31,35,-102014.850612,6820.0,45919049.0,-0.002222,0.000149
3,SEAPMDR,4,worst,2020-04-01,35,-197938.057556,6881.0,45919049.0,-0.004311,0.000150
4,SEAPMDR,5,worst,2020-04-02,35,-283264.104841,7107.0,45919049.0,-0.006169,0.000155
...,...,...,...,...,...,...,...,...,...,...
2586857,SEIR,1,best,2021-01-01,52,54810.000000,3405.0,7018354.0,0.007810,0.000485
2586948,SEAPMDR,1,worst,2021-01-01,53,43544.442857,1423.0,3015268.0,0.014441,0.000472
2587039,SEAPMDR,1,best,2021-01-01,53,43544.442857,1423.0,3015268.0,0.014441,0.000472
2587130,SEIR,1,worst,2021-01-01,53,35987.142857,1423.0,3015268.0,0.011935,0.000472


Para avaliar a qualidade dos modelos, utilizamos a [raíz do erro quadrático médio (RMSE)](https://en.wikipedia.org/wiki/Root-mean-square_deviation), que expressa os resíduos não explicados pelo modelo, na mesma escala original (número de infectados per capita) - ou seja, quanto menor, melhor.

In [122]:
def rmse( g ):
    """Generates RMSE values for a pd.DataFrame."""
    # CREDIT: https://stackoverflow.com/a/47914634
    # TODO: abstract column names
    rmse = np.sqrt(mean_squared_error( g['infected_ppc'], g['predicted_ppc'] ) )
    return pd.Series(rmse)

performance = (
    predicted_vs_seen
    .groupby(["model", "days", "scenario"])
    .apply(rmse)
    .rename(columns={0:"rmse"})
    .reset_index()
)

performance


overflow encountered in square


overflow encountered in square


overflow encountered in square


overflow encountered in square


overflow encountered in square


overflow encountered in square


overflow encountered in square


overflow encountered in square


overflow encountered in square


overflow encountered in square


overflow encountered in square



Unnamed: 0,model,days,scenario,rmse
0,SEAPMDR,1,best,0.021540
1,SEAPMDR,1,worst,0.021540
2,SEAPMDR,2,best,0.008567
3,SEAPMDR,2,worst,0.007488
4,SEAPMDR,3,best,0.004360
...,...,...,...,...
359,SEIR,89,worst,0.011029
360,SEIR,90,best,0.009571
361,SEIR,90,worst,0.010800
362,SEIR,91,best,0.009466


### Visualização dos resultados 

O gráfico a seguir fornece uma visualização de como os resíduos de ambos os modelos se comportam com o aumento do número de dias da previsão. 

Conforme se pode pode observar na figura, ambos os modelos tem resultados bastante parecidos nos primeiros dias. Porém, o modelo original (SEIR) tende a manter resíduos significativamente menores do que o modelo modificado (SEAPMDR) a partir do 18º dia, quando o resíduo desse segundo modelo aumenta em mais de uma ordem de grandeza.

In [127]:
performance_best = performance.loc[performance.scenario=="best",:]
performance_best = performance_best.loc[performance.days<np.int64(21),:]

fig = px.line(
    performance_best,
    x="days",
    y="rmse",
    color="model",
    title="Raíz do Erro Quadrático Médio para modelos SEIR e SEAPMDR",
    labels={
        "model": "Modelo",
        "days": "dias",
        "rmse": "RMSE",
    }
)

fig.show()