In [13]:
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import ShuffleSplit
from sklearn.model_selection import cross_validate
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

import numpy as np

In [2]:
df_original = pd.read_csv("../data/processed/data_regiao_hidro.csv")

In [3]:
df = df_original.assign(
    lat = df_original.posicao.apply(lambda x: eval(x)[0]),
    lon = df_original.posicao.apply(lambda x: eval(x)[1]),
    ano = df_original.data.apply(lambda x: int(x[:4])),
    mes = df_original.data.apply(lambda x: int(x[5:7]))
).drop(columns=["posicao", "data", "regiao_hidro"], axis=1)

In [4]:
df

Unnamed: 0,pr,divergencia,umidade,vento_vertical,vorticidade,fluxo_energia,EMI,nino3,atn,ats,atlgrad,atl3,seta,nesta,lat,lon,ano,mes
0,68.92,-2.849130e-06,74.14,-0.01,0.000007,93.54,0.49,-0.63,0.28,-0.22,0.50,-0.32,-0.21,0.14,-4.75,-39.25,1981,1
1,59.98,-3.877240e-06,73.94,-0.02,0.000008,120.53,0.49,-0.63,0.28,-0.22,0.50,-0.32,-0.21,0.14,-4.75,-39.00,1981,1
2,54.32,-4.451220e-06,73.52,-0.03,0.000011,75.55,0.49,-0.63,0.28,-0.22,0.50,-0.32,-0.21,0.14,-4.75,-38.75,1981,1
3,34.91,-4.331980e-06,73.19,-0.02,0.000014,24.54,0.49,-0.63,0.28,-0.22,0.50,-0.32,-0.21,0.14,-4.75,-38.50,1981,1
4,60.35,-2.912150e-06,74.28,0.07,0.000006,81.54,0.49,-0.63,0.28,-0.22,0.50,-0.32,-0.21,0.14,-4.50,-39.00,1981,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
96325,12.81,7.554250e-07,78.69,-0.04,0.000015,97.69,0.04,-1.17,0.45,0.40,0.05,0.12,0.35,0.51,-5.25,-40.25,2022,2
96326,27.71,1.383160e-06,79.00,-0.13,0.000006,-135.32,0.04,-1.17,0.45,0.40,0.05,0.12,0.35,0.51,-5.00,-40.75,2022,2
96327,24.11,-1.849470e-07,79.46,-0.09,-0.000002,120.67,0.04,-1.17,0.45,0.40,0.05,0.12,0.35,0.51,-5.00,-40.50,2022,2
96328,16.98,2.199350e-06,78.71,-0.05,0.000014,292.69,0.04,-1.17,0.45,0.40,0.05,0.12,0.35,0.51,-5.00,-40.25,2022,2


In [5]:
X = df.drop(columns=["pr"], axis=1)
y = df[["pr"]]

In [6]:
X

Unnamed: 0,divergencia,umidade,vento_vertical,vorticidade,fluxo_energia,EMI,nino3,atn,ats,atlgrad,atl3,seta,nesta,lat,lon,ano,mes
0,-2.849130e-06,74.14,-0.01,0.000007,93.54,0.49,-0.63,0.28,-0.22,0.50,-0.32,-0.21,0.14,-4.75,-39.25,1981,1
1,-3.877240e-06,73.94,-0.02,0.000008,120.53,0.49,-0.63,0.28,-0.22,0.50,-0.32,-0.21,0.14,-4.75,-39.00,1981,1
2,-4.451220e-06,73.52,-0.03,0.000011,75.55,0.49,-0.63,0.28,-0.22,0.50,-0.32,-0.21,0.14,-4.75,-38.75,1981,1
3,-4.331980e-06,73.19,-0.02,0.000014,24.54,0.49,-0.63,0.28,-0.22,0.50,-0.32,-0.21,0.14,-4.75,-38.50,1981,1
4,-2.912150e-06,74.28,0.07,0.000006,81.54,0.49,-0.63,0.28,-0.22,0.50,-0.32,-0.21,0.14,-4.50,-39.00,1981,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
96325,7.554250e-07,78.69,-0.04,0.000015,97.69,0.04,-1.17,0.45,0.40,0.05,0.12,0.35,0.51,-5.25,-40.25,2022,2
96326,1.383160e-06,79.00,-0.13,0.000006,-135.32,0.04,-1.17,0.45,0.40,0.05,0.12,0.35,0.51,-5.00,-40.75,2022,2
96327,-1.849470e-07,79.46,-0.09,-0.000002,120.67,0.04,-1.17,0.45,0.40,0.05,0.12,0.35,0.51,-5.00,-40.50,2022,2
96328,2.199350e-06,78.71,-0.05,0.000014,292.69,0.04,-1.17,0.45,0.40,0.05,0.12,0.35,0.51,-5.00,-40.25,2022,2


In [7]:
X.shape

(96330, 17)

In [33]:
n_splits = 30
models = {
    'linear': LinearRegression(),
    'arvore de decisão': DecisionTreeRegressor()
}



preprocessing = ColumnTransformer([
    ("ohe", OneHotEncoder(), ["mes"]),
    ("scaler", StandardScaler(), X.columns.to_list()[:-1])
])


scores = {}
for model_name, model in models.items():
    model_score = cross_validate(
        estimator = Pipeline(steps=[
            ("preprocessing", preprocessing),
            ("model", model)
        ]),
        X = X,
        y = y,
        cv = ShuffleSplit(n_splits=n_splits, test_size=.2),
        scoring = [
            'neg_mean_absolute_error',
            'neg_mean_squared_error',
            'neg_root_mean_squared_error',
            'r2'
        ]
    )
    if not(scores):
        scores = {key: [] for key in model_score}
        scores['model'] = []
    for key in model_score:
        scores[key].extend(model_score[key])
    scores['model'].extend([model_name] * n_splits)

In [32]:
pd.DataFrame(scores).groupby('model').agg([np.mean, np.std]).transpose()

Unnamed: 0,model,arvore de decisão,linear
fit_time,mean,1.960978,0.219865
fit_time,std,0.132073,0.087411
score_time,mean,0.031119,0.03563
score_time,std,0.006764,0.006269
test_neg_mean_absolute_error,mean,-15.948306,-32.508062
test_neg_mean_absolute_error,std,0.192847,0.126049
test_neg_mean_squared_error,mean,-971.311143,-2280.727459
test_neg_mean_squared_error,std,21.111549,33.030641
test_neg_root_mean_squared_error,mean,-31.164641,-47.756125
test_neg_root_mean_squared_error,std,0.338346,0.346274


In [9]:
pd.DataFrame(scores).agg([np.mean, np.std])

Unnamed: 0,linear,arvore de decisão
mean,-2275.493889,-921.209359
std,3.921929,20.777263


In [None]:
pd.DataFrame(scores).agg([np.mean, np.std])

Unnamed: 0,linear,arvore de decisão
mean,-3292.874923,-899.408542
std,53.171592,9.789537
