# Comparative Analysis

## Importando bibliotecas

In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import ShuffleSplit
from sklearn.model_selection import cross_validate
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor

In [96]:
def get_future(df, columns, janela):
    """
    A função pega a base de dados, e para cada ponto (lat,lon), adiciona a respectiva precipitação 2 meses à frente e algumas variáveis nos últimos 4 meses
    
    """
    suffix = 'mais' if janela > 0 else 'menos'
    df_out = df.copy()
    new_columns = [f'{variavel}_{suffix}_{abs(janela)}' for variavel in columns]
    for posicao in df.posicao.unique():
        criteria = "posicao == @posicao"
        df_out.loc[df_out.eval(criteria), new_columns] = (
            df_out
            .query(criteria)
            .shift(periods=-janela)[columns].values
        )
    return df_out

important_columns = [
    #'divergencia', 'umidade', 'vento_vertical', 'vorticidade', 'fluxo_energia', 
    'EMI', 'nino3', 'atl3' #, 'atn', 'ats', 'atlgrad',  'seta', 'nesta'
]
df_original = (
    pd
    .read_csv("../data/raw/data_regiao_hidro.csv")
    .pipe(get_future, ['pr'], 2)
    .pipe(get_future, important_columns, -1)
    .pipe(get_future, important_columns, -2)
    .pipe(get_future, important_columns, -3)
    .pipe(get_future, important_columns, -4)
)

In [97]:
df_original

Unnamed: 0,data,posicao,pr,divergencia,umidade,vento_vertical,vorticidade,fluxo_energia,EMI,nino3,...,atl3_menos_1,EMI_menos_2,nino3_menos_2,atl3_menos_2,EMI_menos_3,nino3_menos_3,atl3_menos_3,EMI_menos_4,nino3_menos_4,atl3_menos_4
0,1981-01-01,"(-4.75, -39.25)",68.92,-2.849130e-06,74.14,-0.01,0.000007,93.54,0.49,-0.63,...,,,,,,,,,,
1,1981-01-01,"(-4.75, -39.0)",59.98,-3.877240e-06,73.94,-0.02,0.000008,120.53,0.49,-0.63,...,,,,,,,,,,
2,1981-01-01,"(-4.75, -38.75)",54.32,-4.451220e-06,73.52,-0.03,0.000011,75.55,0.49,-0.63,...,,,,,,,,,,
3,1981-01-01,"(-4.75, -38.5)",34.91,-4.331980e-06,73.19,-0.02,0.000014,24.54,0.49,-0.63,...,,,,,,,,,,
4,1981-01-01,"(-4.5, -39.0)",60.35,-2.912150e-06,74.28,0.07,0.000006,81.54,0.49,-0.63,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
96325,2022-02-01,"(-5.25, -40.25)",12.81,7.554250e-07,78.69,-0.04,0.000015,97.69,0.04,-1.17,...,0.23,-0.41,-1.28,0.23,-0.77,-1.03,0.55,-0.77,-0.79,0.65
96326,2022-02-01,"(-5.0, -40.75)",27.71,1.383160e-06,79.00,-0.13,0.000006,-135.32,0.04,-1.17,...,0.23,-0.41,-1.28,0.23,-0.77,-1.03,0.55,-0.77,-0.79,0.65
96327,2022-02-01,"(-5.0, -40.5)",24.11,-1.849470e-07,79.46,-0.09,-0.000002,120.67,0.04,-1.17,...,0.23,-0.41,-1.28,0.23,-0.77,-1.03,0.55,-0.77,-0.79,0.65
96328,2022-02-01,"(-5.0, -40.25)",16.98,2.199350e-06,78.71,-0.05,0.000014,292.69,0.04,-1.17,...,0.23,-0.41,-1.28,0.23,-0.77,-1.03,0.55,-0.77,-0.79,0.65


In [98]:
df = df_original.assign(
    lat = df_original.posicao.apply(lambda x: eval(x)[0]),
    lon = df_original.posicao.apply(lambda x: eval(x)[1]),
    ano = df_original.data.apply(lambda x: int(x[:4])),
    mes = df_original.data.apply(lambda x: int(x[5:7]))
).drop(columns=["posicao","regiao_hidro"], axis=1)

In [99]:
pd.to_datetime(df['data'])
df = df.set_index(['data'])
df = df['1981-01':'2022-01']

In [100]:
df.tail()

Unnamed: 0_level_0,pr,divergencia,umidade,vento_vertical,vorticidade,fluxo_energia,EMI,nino3,atn,ats,...,EMI_menos_3,nino3_menos_3,atl3_menos_3,EMI_menos_4,nino3_menos_4,atl3_menos_4,lat,lon,ano,mes
data,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2021-12-01,25.01,-9.04746e-07,69.77,0.0,1.7e-05,153.35,-0.41,-1.28,0.22,0.27,...,-0.59,-0.44,0.92,-0.42,-0.46,1.27,-5.25,-40.25,2021,12
2021-12-01,46.2,-8.48549e-08,70.93,-0.17,4e-06,-91.65,-0.41,-1.28,0.22,0.27,...,-0.59,-0.44,0.92,-0.42,-0.46,1.27,-5.0,-40.75,2021,12
2021-12-01,34.76,-1.22479e-06,70.94,-0.13,6e-06,-7.64,-0.41,-1.28,0.22,0.27,...,-0.59,-0.44,0.92,-0.42,-0.46,1.27,-5.0,-40.5,2021,12
2021-12-01,24.72,4.1437e-07,70.24,-0.04,1.9e-05,271.37,-0.41,-1.28,0.22,0.27,...,-0.59,-0.44,0.92,-0.42,-0.46,1.27,-5.0,-40.25,2021,12
2021-12-01,51.99,-2.34375e-07,71.31,-0.12,3e-06,109.36,-0.41,-1.28,0.22,0.27,...,-0.59,-0.44,0.92,-0.42,-0.46,1.27,-4.75,-40.75,2021,12


In [101]:
df

Unnamed: 0_level_0,pr,divergencia,umidade,vento_vertical,vorticidade,fluxo_energia,EMI,nino3,atn,ats,...,EMI_menos_3,nino3_menos_3,atl3_menos_3,EMI_menos_4,nino3_menos_4,atl3_menos_4,lat,lon,ano,mes
data,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1981-01-01,68.92,-2.849130e-06,74.14,-0.01,0.000007,93.54,0.49,-0.63,0.28,-0.22,...,,,,,,,-4.75,-39.25,1981,1
1981-01-01,59.98,-3.877240e-06,73.94,-0.02,0.000008,120.53,0.49,-0.63,0.28,-0.22,...,,,,,,,-4.75,-39.00,1981,1
1981-01-01,54.32,-4.451220e-06,73.52,-0.03,0.000011,75.55,0.49,-0.63,0.28,-0.22,...,,,,,,,-4.75,-38.75,1981,1
1981-01-01,34.91,-4.331980e-06,73.19,-0.02,0.000014,24.54,0.49,-0.63,0.28,-0.22,...,,,,,,,-4.75,-38.50,1981,1
1981-01-01,60.35,-2.912150e-06,74.28,0.07,0.000006,81.54,0.49,-0.63,0.28,-0.22,...,,,,,,,-4.50,-39.00,1981,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2021-12-01,25.01,-9.047460e-07,69.77,0.00,0.000017,153.35,-0.41,-1.28,0.22,0.27,...,-0.59,-0.44,0.92,-0.42,-0.46,1.27,-5.25,-40.25,2021,12
2021-12-01,46.20,-8.485490e-08,70.93,-0.17,0.000004,-91.65,-0.41,-1.28,0.22,0.27,...,-0.59,-0.44,0.92,-0.42,-0.46,1.27,-5.00,-40.75,2021,12
2021-12-01,34.76,-1.224790e-06,70.94,-0.13,0.000006,-7.64,-0.41,-1.28,0.22,0.27,...,-0.59,-0.44,0.92,-0.42,-0.46,1.27,-5.00,-40.50,2021,12
2021-12-01,24.72,4.143700e-07,70.24,-0.04,0.000019,271.37,-0.41,-1.28,0.22,0.27,...,-0.59,-0.44,0.92,-0.42,-0.46,1.27,-5.00,-40.25,2021,12


In [104]:
df.isnull().sum().sort_values(ascending=False)

pr                0
nino3_menos_1     0
ano               0
lon               0
lat               0
atl3_menos_4      0
nino3_menos_4     0
EMI_menos_4       0
atl3_menos_3      0
nino3_menos_3     0
EMI_menos_3       0
atl3_menos_2      0
nino3_menos_2     0
EMI_menos_2       0
atl3_menos_1      0
EMI_menos_1       0
divergencia       0
pr_mais_2         0
nesta             0
seta              0
atl3              0
atlgrad           0
ats               0
atn               0
nino3             0
EMI               0
fluxo_energia     0
vorticidade       0
vento_vertical    0
umidade           0
mes               0
dtype: int64

In [103]:
df.dropna(inplace=True)

In [105]:
X = df.drop(columns=["pr_mais_2"], axis=1)
y = df[["pr_mais_2"]]

In [106]:
X.shape

(95160, 30)

In [107]:
X

Unnamed: 0_level_0,pr,divergencia,umidade,vento_vertical,vorticidade,fluxo_energia,EMI,nino3,atn,ats,...,EMI_menos_3,nino3_menos_3,atl3_menos_3,EMI_menos_4,nino3_menos_4,atl3_menos_4,lat,lon,ano,mes
data,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1981-05-01,40.41,-3.356390e-06,85.50,-0.10,0.000006,51.78,-0.51,-0.21,0.38,-0.33,...,0.26,-0.76,-0.16,0.49,-0.63,-0.32,-4.75,-39.25,1981,5
1981-05-01,51.94,-2.265260e-06,84.89,-0.11,0.000005,72.78,-0.51,-0.21,0.38,-0.33,...,0.26,-0.76,-0.16,0.49,-0.63,-0.32,-4.75,-39.00,1981,5
1981-05-01,50.19,-1.520130e-06,84.39,-0.12,0.000004,82.79,-0.51,-0.21,0.38,-0.33,...,0.26,-0.76,-0.16,0.49,-0.63,-0.32,-4.75,-38.75,1981,5
1981-05-01,47.31,-2.272670e-06,83.94,-0.08,0.000003,15.77,-0.51,-0.21,0.38,-0.33,...,0.26,-0.76,-0.16,0.49,-0.63,-0.32,-4.75,-38.50,1981,5
1981-05-01,90.20,-1.694980e-06,85.09,-0.04,0.000004,22.76,-0.51,-0.21,0.38,-0.33,...,0.26,-0.76,-0.16,0.49,-0.63,-0.32,-4.50,-39.00,1981,5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2021-12-01,25.01,-9.047460e-07,69.77,0.00,0.000017,153.35,-0.41,-1.28,0.22,0.27,...,-0.59,-0.44,0.92,-0.42,-0.46,1.27,-5.25,-40.25,2021,12
2021-12-01,46.20,-8.485490e-08,70.93,-0.17,0.000004,-91.65,-0.41,-1.28,0.22,0.27,...,-0.59,-0.44,0.92,-0.42,-0.46,1.27,-5.00,-40.75,2021,12
2021-12-01,34.76,-1.224790e-06,70.94,-0.13,0.000006,-7.64,-0.41,-1.28,0.22,0.27,...,-0.59,-0.44,0.92,-0.42,-0.46,1.27,-5.00,-40.50,2021,12
2021-12-01,24.72,4.143700e-07,70.24,-0.04,0.000019,271.37,-0.41,-1.28,0.22,0.27,...,-0.59,-0.44,0.92,-0.42,-0.46,1.27,-5.00,-40.25,2021,12


## Treinando os Modelos

In [108]:
n_splits = 30
models = {
    'Linear': LinearRegression(),
    'KNN': KNeighborsRegressor(n_neighbors=3),
    'Árvore de Decisão': DecisionTreeRegressor(),
    'Xgboost': XGBRegressor()
}


preprocessing = ColumnTransformer([
    ("ohe", OneHotEncoder(), ["mes"]),
    ("scaler", StandardScaler(), X.columns.to_list()[:-1])
])


scores = {}
for model_name, model in models.items():
    model_score = cross_validate(
        estimator = Pipeline(steps=[
            ("preprocessing", preprocessing),
            ("model", model)
        ]),
        X = X,
        y = y,
        cv = ShuffleSplit(n_splits=n_splits, test_size=.2),
        scoring = [
            'neg_mean_absolute_error',
            'neg_mean_squared_error',
            'neg_root_mean_squared_error',
            'r2'
        ]
    )
    if not(scores):
        scores = {key: [] for key in model_score}
        scores['model'] = []
    for key in model_score:
        scores[key].extend(model_score[key])
    scores['model'].extend([model_name] * n_splits)

## Avaliando os modelos

In [109]:
# modelo com janela de meses passados v2 somente com 'EMI', 'nino3', 'atl3'
pd.DataFrame(scores).groupby('model').agg([np.mean, np.std]).transpose()

Unnamed: 0,model,KNN,Linear,Xgboost,Árvore de Decisão
fit_time,mean,0.051635,0.105706,2.210214,1.298823
fit_time,std,0.005102,0.005685,0.388562,0.090342
score_time,mean,23.377637,0.014174,0.019532,0.014743
score_time,std,0.369601,0.003645,0.00322,0.002545
test_neg_mean_absolute_error,mean,-12.16735,-33.523725,-13.226714,-14.135489
test_neg_mean_absolute_error,std,0.150199,0.249098,0.157485,0.187028
test_neg_mean_squared_error,mean,-535.700042,-2668.184202,-481.609077,-731.85625
test_neg_mean_squared_error,std,17.756227,53.801227,13.459012,24.820929
test_neg_root_mean_squared_error,mean,-23.142147,-51.65194,-21.943515,-27.049106
test_neg_root_mean_squared_error,std,0.382005,0.519927,0.307194,0.457232


In [95]:
# modelo com janela de meses passados
pd.DataFrame(scores).groupby('model').agg([np.mean, np.std]).transpose()

Unnamed: 0,model,KNN,Linear,Xgboost,Árvore de Decisão
fit_time,mean,0.102182,0.327648,3.528511,3.826715
fit_time,std,0.005776,0.023031,0.023661,0.051908
score_time,mean,23.500015,0.018646,0.020298,0.019162
score_time,std,0.180113,0.002106,0.000913,0.000706
test_neg_mean_absolute_error,mean,-13.690725,-33.163224,-13.818304,-15.479646
test_neg_mean_absolute_error,std,0.132121,0.203619,0.151301,0.195222
test_neg_mean_squared_error,mean,-708.999324,-2522.993032,-522.93537,-883.521332
test_neg_mean_squared_error,std,18.816811,40.938429,14.486197,30.649346
test_neg_root_mean_squared_error,mean,-26.624773,-50.227807,-22.865662,-29.719749
test_neg_root_mean_squared_error,std,0.353512,0.407442,0.316527,0.516476


In [68]:
# modelo sem janela de meses passados
pd.DataFrame(scores).groupby('model').agg([np.mean, np.std]).transpose()

Unnamed: 0,model,KNN,Linear,Xgboost,Árvore de Decisão
fit_time,mean,0.032357,0.070914,1.671338,0.957515
fit_time,std,0.004816,0.004672,0.106544,0.018175
score_time,mean,24.666946,0.011212,0.015052,0.012929
score_time,std,0.302454,0.003271,0.001261,0.001149
test_neg_mean_absolute_error,mean,-12.483224,-33.983933,-13.533248,-14.316866
test_neg_mean_absolute_error,std,0.144579,0.199057,0.184104,0.197059
test_neg_mean_squared_error,mean,-568.018424,-2768.917152,-503.148778,-749.145202
test_neg_mean_squared_error,std,19.412028,41.29362,16.509854,24.758997
test_neg_root_mean_squared_error,mean,-23.829779,-52.619094,-22.428066,-27.366869
test_neg_root_mean_squared_error,std,0.406916,0.391476,0.367637,0.454485
