# Comparative Analysis

## Importando bibliotecas

In [None]:
import joblib
import pandas as pd
import numpy as np
from sklearn.model_selection import RandomizedSearchCV, cross_validate, ShuffleSplit
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import (
    StandardScaler, MinMaxScaler, RobustScaler, OneHotEncoder,
    LabelEncoder, SplineTransformer, OrdinalEncoder
)
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from xgboost import XGBRegressor

In [None]:
# Mount Google Drive at /content/drive (only available inside Google Colab).
# Later cells read the input CSV from, and save the fitted model to, paths
# under this mount point.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


## Tratamento dos Dados

#### Como nosso objetivo é predizer a precipitação mensal para as Regiões Hidrográficas do Estado do Ceará, precisamos que nosso modelo consiga fazer a predição da precipitação futura. <br><br> Para tal, foi criada uma função que cria uma janela e para cada ponto (lat,lon), adiciona a respectiva precipitação 2 meses à frente.  <br><br>  Além disso, avaliamos a influência das variáveis preditoras (índices oceânicos e variáveis atmosféricas) nos 4 meses anteriores, a fim de buscar a melhor forma de predição. 

In [None]:
def get_future(df, columns, janela):
    """
    Add lead/lag copies of ``columns``, computed independently per grid point.

    For every distinct value of ``df['posicao']`` the selected columns are
    shifted by ``janela`` rows and stored in new columns named
    ``<column>_mais_<n>`` (``janela > 0``: the value found ``n`` rows ahead)
    or ``<column>_menos_<n>`` (``janela <= 0``: the value found ``n`` rows
    back), where ``n = abs(janela)``.

    The shift is positional, so the rows of each ``posicao`` must already be
    in chronological order. Rows whose shifted partner falls outside the
    group receive NaN.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame containing a ``posicao`` column and every name in
        ``columns``. It is not modified.
    columns : list of str
        Columns to shift.
    janela : int
        Window size in rows; positive looks ahead, negative looks back.

    Returns
    -------
    pandas.DataFrame
        A copy of ``df`` with one extra column per entry of ``columns``.
    """
    suffix = 'mais' if janela > 0 else 'menos'
    df_out = df.copy()
    # One grouped shift replaces the former per-position eval/query loop,
    # which rescanned the whole frame once for every grid point.
    shifted = (
        df
        .groupby('posicao', sort=False)[list(columns)]
        .shift(periods=-janela)
    )
    for variavel in columns:
        # groupby.shift keeps the original row order, so a positional
        # assignment is safe even if the index labels are not unique.
        df_out[f'{variavel}_{suffix}_{abs(janela)}'] = shifted[variavel].to_numpy()
    return df_out

# Ocean indices whose values from previous rows are added as extra predictors.
important_columns = ['EMI', 'nino3', 'atl3']

# Load the raw table, attach the target (pr two rows ahead, i.e. two months
# per the notebook text), then attach the indices lagged by 1 to 4 rows.
df_original = pd.read_csv("/content/drive/MyDrive/Pyoneers/data_regiao_hidro.csv")
df_original = get_future(df_original, ['pr'], 2)
for lag in (-1, -2, -3, -4):
    df_original = get_future(df_original, important_columns, lag)
# Debugging aid (remove later):
# df_original = df_original.sample(1000, random_state=42)

In [None]:
# Preview the frame with the lead/lag columns added by get_future.
df_original

Unnamed: 0,data,posicao,pr,divergencia,umidade,vento_vertical,vorticidade,fluxo_energia,EMI,nino3,...,atl3_menos_1,EMI_menos_2,nino3_menos_2,atl3_menos_2,EMI_menos_3,nino3_menos_3,atl3_menos_3,EMI_menos_4,nino3_menos_4,atl3_menos_4
0,1981-01-01,"(-4.75, -39.25)",68.92,-2.849130e-06,74.14,-0.01,0.000007,93.54,0.49,-0.63,...,,,,,,,,,,
1,1981-01-01,"(-4.75, -39.0)",59.98,-3.877240e-06,73.94,-0.02,0.000008,120.53,0.49,-0.63,...,,,,,,,,,,
2,1981-01-01,"(-4.75, -38.75)",54.32,-4.451220e-06,73.52,-0.03,0.000011,75.55,0.49,-0.63,...,,,,,,,,,,
3,1981-01-01,"(-4.75, -38.5)",34.91,-4.331980e-06,73.19,-0.02,0.000014,24.54,0.49,-0.63,...,,,,,,,,,,
4,1981-01-01,"(-4.5, -39.0)",60.35,-2.912150e-06,74.28,0.07,0.000006,81.54,0.49,-0.63,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
96325,2022-02-01,"(-5.25, -40.25)",12.81,7.554250e-07,78.69,-0.04,0.000015,97.69,0.04,-1.17,...,0.23,-0.41,-1.28,0.23,-0.77,-1.03,0.55,-0.77,-0.79,0.65
96326,2022-02-01,"(-5.0, -40.75)",27.71,1.383160e-06,79.00,-0.13,0.000006,-135.32,0.04,-1.17,...,0.23,-0.41,-1.28,0.23,-0.77,-1.03,0.55,-0.77,-0.79,0.65
96327,2022-02-01,"(-5.0, -40.5)",24.11,-1.849470e-07,79.46,-0.09,-0.000002,120.67,0.04,-1.17,...,0.23,-0.41,-1.28,0.23,-0.77,-1.03,0.55,-0.77,-0.79,0.65
96328,2022-02-01,"(-5.0, -40.25)",16.98,2.199350e-06,78.71,-0.05,0.000014,292.69,0.04,-1.17,...,0.23,-0.41,-1.28,0.23,-0.77,-1.03,0.55,-0.77,-0.79,0.65


In [None]:
def _parse_posicao(text):
    """Return ``(lat, lon)`` as floats parsed from a ``"(lat, lon)"`` string."""
    lat, lon = text.strip().strip("()").split(",")
    return float(lat), float(lon)


# Parse each coordinate string once, without eval(): the text comes from a
# CSV file and must not be executed as Python code.
coords = pd.DataFrame(
    [_parse_posicao(posicao) for posicao in df_original.posicao],
    index=df_original.index,
    columns=["lat", "lon"],
)

# Expand position and date into numeric features, drop the raw text columns,
# and discard rows left incomplete by the lead/lag windows.
df = (
    df_original
    .assign(
        lat=coords["lat"].to_numpy(),
        lon=coords["lon"].to_numpy(),
        # `data` is an ISO "YYYY-MM-DD" string: chars 0-3 = year, 5-6 = month.
        ano=df_original.data.str[:4].astype(int),
        mes=df_original.data.str[5:7].astype(int),
    )
    .drop(columns=["data", "posicao", "regiao_hidro"])
    .dropna()
)

In [None]:
# Preview the modelling frame (lat/lon/ano/mes added, incomplete rows dropped).
df

Unnamed: 0,pr,divergencia,umidade,vento_vertical,vorticidade,fluxo_energia,EMI,nino3,atn,ats,...,EMI_menos_3,nino3_menos_3,atl3_menos_3,EMI_menos_4,nino3_menos_4,atl3_menos_4,lat,lon,ano,mes
780,40.41,-3.356390e-06,85.50,-0.10,0.000006,51.78,-0.51,-0.21,0.38,-0.33,...,0.26,-0.76,-0.16,0.49,-0.63,-0.32,-4.75,-39.25,1981,5
781,51.94,-2.265260e-06,84.89,-0.11,0.000005,72.78,-0.51,-0.21,0.38,-0.33,...,0.26,-0.76,-0.16,0.49,-0.63,-0.32,-4.75,-39.00,1981,5
782,50.19,-1.520130e-06,84.39,-0.12,0.000004,82.79,-0.51,-0.21,0.38,-0.33,...,0.26,-0.76,-0.16,0.49,-0.63,-0.32,-4.75,-38.75,1981,5
783,47.31,-2.272670e-06,83.94,-0.08,0.000003,15.77,-0.51,-0.21,0.38,-0.33,...,0.26,-0.76,-0.16,0.49,-0.63,-0.32,-4.75,-38.50,1981,5
784,90.20,-1.694980e-06,85.09,-0.04,0.000004,22.76,-0.51,-0.21,0.38,-0.33,...,0.26,-0.76,-0.16,0.49,-0.63,-0.32,-4.50,-39.00,1981,5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95935,25.01,-9.047460e-07,69.77,0.00,0.000017,153.35,-0.41,-1.28,0.22,0.27,...,-0.59,-0.44,0.92,-0.42,-0.46,1.27,-5.25,-40.25,2021,12
95936,46.20,-8.485490e-08,70.93,-0.17,0.000004,-91.65,-0.41,-1.28,0.22,0.27,...,-0.59,-0.44,0.92,-0.42,-0.46,1.27,-5.00,-40.75,2021,12
95937,34.76,-1.224790e-06,70.94,-0.13,0.000006,-7.64,-0.41,-1.28,0.22,0.27,...,-0.59,-0.44,0.92,-0.42,-0.46,1.27,-5.00,-40.50,2021,12
95938,24.72,4.143700e-07,70.24,-0.04,0.000019,271.37,-0.41,-1.28,0.22,0.27,...,-0.59,-0.44,0.92,-0.42,-0.46,1.27,-5.00,-40.25,2021,12


In [None]:
# Sanity check: number of missing values per column, largest first.
missing_per_column = df.isna().sum()
missing_per_column.sort_values(ascending=False)

pr                0
nino3_menos_1     0
ano               0
lon               0
lat               0
atl3_menos_4      0
nino3_menos_4     0
EMI_menos_4       0
atl3_menos_3      0
nino3_menos_3     0
EMI_menos_3       0
atl3_menos_2      0
nino3_menos_2     0
EMI_menos_2       0
atl3_menos_1      0
EMI_menos_1       0
divergencia       0
pr_mais_2         0
nesta             0
seta              0
atl3              0
atlgrad           0
ats               0
atn               0
nino3             0
EMI               0
fluxo_energia     0
vorticidade       0
vento_vertical    0
umidade           0
mes               0
dtype: int64

In [None]:
# Name of the column to predict (created by get_future: pr shifted 2 rows ahead).
target_column = 'pr_mais_2'
#nominal_columns = [column for column in list(df.select_dtypes(object)) if column != target_column]
# Every numeric column except the target is treated as a quantitative feature.
# The exclusion uses `target_column` rather than a repeated literal so the two
# cannot drift apart.
# NOTE(review): `mes` is numeric, so it is included here and is ALSO one-hot
# encoded by the preprocessing ColumnTransformer - confirm this is intended.
quantitative_columns = [
    column
    for column in df.select_dtypes(np.number).columns
    if column != target_column
]

In [None]:
# Feature matrix: every column except the target. The target label was
# previously listed twice and `axis=1` was redundant next to `columns=`.
X = df.drop(columns=[target_column])
# Target as a 1-D NumPy array. Series.ravel() is deprecated in recent pandas;
# to_numpy() returns the same array.
y = df[target_column].to_numpy()

In [None]:
# X = (
#     df
#     .query(f'{target_column}.notna()')
#     .drop([target_column,'pr_mais_2'], axis=1)
# )
# y = (
#     df
#     .query(f'{target_column}.notna()')[[target_column]]
#     .values.ravel()
# )

In [None]:
# Confirm features and target have the same number of rows.
print(X.shape, y.shape, sep='\n')

(95160, 30)
(95160,)


In [None]:
# Preview the feature matrix.
X

Unnamed: 0,pr,divergencia,umidade,vento_vertical,vorticidade,fluxo_energia,EMI,nino3,atn,ats,...,EMI_menos_3,nino3_menos_3,atl3_menos_3,EMI_menos_4,nino3_menos_4,atl3_menos_4,lat,lon,ano,mes
780,40.41,-3.356390e-06,85.50,-0.10,0.000006,51.78,-0.51,-0.21,0.38,-0.33,...,0.26,-0.76,-0.16,0.49,-0.63,-0.32,-4.75,-39.25,1981,5
781,51.94,-2.265260e-06,84.89,-0.11,0.000005,72.78,-0.51,-0.21,0.38,-0.33,...,0.26,-0.76,-0.16,0.49,-0.63,-0.32,-4.75,-39.00,1981,5
782,50.19,-1.520130e-06,84.39,-0.12,0.000004,82.79,-0.51,-0.21,0.38,-0.33,...,0.26,-0.76,-0.16,0.49,-0.63,-0.32,-4.75,-38.75,1981,5
783,47.31,-2.272670e-06,83.94,-0.08,0.000003,15.77,-0.51,-0.21,0.38,-0.33,...,0.26,-0.76,-0.16,0.49,-0.63,-0.32,-4.75,-38.50,1981,5
784,90.20,-1.694980e-06,85.09,-0.04,0.000004,22.76,-0.51,-0.21,0.38,-0.33,...,0.26,-0.76,-0.16,0.49,-0.63,-0.32,-4.50,-39.00,1981,5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95935,25.01,-9.047460e-07,69.77,0.00,0.000017,153.35,-0.41,-1.28,0.22,0.27,...,-0.59,-0.44,0.92,-0.42,-0.46,1.27,-5.25,-40.25,2021,12
95936,46.20,-8.485490e-08,70.93,-0.17,0.000004,-91.65,-0.41,-1.28,0.22,0.27,...,-0.59,-0.44,0.92,-0.42,-0.46,1.27,-5.00,-40.75,2021,12
95937,34.76,-1.224790e-06,70.94,-0.13,0.000006,-7.64,-0.41,-1.28,0.22,0.27,...,-0.59,-0.44,0.92,-0.42,-0.46,1.27,-5.00,-40.50,2021,12
95938,24.72,4.143700e-07,70.24,-0.04,0.000019,271.37,-0.41,-1.28,0.22,0.27,...,-0.59,-0.44,0.92,-0.42,-0.46,1.27,-5.00,-40.25,2021,12


In [None]:
# Preview the target array.
y

array([11.39, 11.02, 11.08, ..., 24.11, 16.98, 50.48])

## Treinando os Modelos

#### Antes do treinamento, os dados passaram por um pré-processamento, onde foram padronizados, de forma a estarem na mesma magnitude, e a sazonalidade foi considerada, por meio da aplicação do método OneHotEncoder na variável "mes". <br><br> Os  modelos escolhidos foram:
- **Regressão Linear**;
- **KNN**;
- **Árvore de Decisão**;
- **Xgboost**.

In [None]:
# Numeric branch: impute missing values, then rescale. The step names
# ("missing", "scaler") are referenced by the hyper-parameter search grids
# and must not be renamed.
quantitative_steps = [
    ("missing", SimpleImputer()),
    ("scaler", StandardScaler()),
]
quantitative_preprocessing = Pipeline(steps=quantitative_steps)

# Full preprocessing: one-hot encode the month and apply the numeric branch
# to the quantitative columns. No nominal branch is active (the dataset has
# no remaining text columns after the drop above).
preprocessing = ColumnTransformer(
    transformers=[
        ("ohe", OneHotEncoder(), ["mes"]),
        ("quantitative", quantitative_preprocessing, quantitative_columns),
    ]
)

In [None]:
# Candidate regressors. Each entry holds a display name, an unfitted
# estimator, and the hyper-parameter values explored by the randomized
# search (keys are estimator parameter names; the "model__" prefix is added
# when the search grid is built).
models = [{
    'name': 'KNN',
    'model': KNeighborsRegressor(),
    'parameters': {
        'n_neighbors': np.arange(3, 17, 2),
        'weights': ['uniform', 'distance'],
        'metric': ['euclidean', 'manhattan'],
        'algorithm': ['ball_tree', 'kd_tree', 'brute'],
        'leaf_size': [10, 20, 30, 40]
    }
},{
    'name': 'Linear',
    'model': LinearRegression(),
    # No model hyper-parameters: only the preprocessing options are searched.
    'parameters': {
       }
},{
    'name': 'XGBoost',
    'model': XGBRegressor(random_state=42),
    'parameters': {
        'learning_rate': [0.05, 0.1, 0.2],
        'n_estimators' : [100, 200, 300],
        'max_depth'    : [3, 5, 8],
        'min_child_weight': [1, 5, 10]
    }
},{
    'name': 'Decision Tree',
    'model':  DecisionTreeRegressor(random_state=42),
    'parameters': {
        # Fractions of the training set required in each leaf.
        "min_samples_leaf": np.linspace(0.1, 0.5, 6),
        "max_depth": [3, 5, 8],
        # None (use all features) replaces "auto", which meant the same for
        # regressors but was removed in scikit-learn 1.3; with "auto" every
        # sampled candidate using it fails to fit on current versions.
        "max_features": ["log2", "sqrt", None],
        "criterion": ["friedman_mse", "squared_error"]
    }
}
]

In [None]:
def concatenate(*args):
    """
    Merge several dict-of-sequences into a single dict of lists.

    For every key, the values of all dictionaries are appended in argument
    order. Keys keep the order in which they are first seen. The inputs are
    not modified.

    Calling with no arguments returns an empty dict (previously an
    IndexError), and a key missing from the first dictionary is accepted
    (previously a KeyError).

    Parameters
    ----------
    *args : dict
        Mappings from key to an iterable of values (list, tuple, array, ...).

    Returns
    -------
    dict
        Mapping from each key to a new list with all collected values.
    """
    final_dict = {}
    for dictionary in args:
        for key, value in dictionary.items():
            final_dict.setdefault(key, []).extend(value)
    return final_dict

## Avaliando os modelos

#### As métricas utilizadas para avaliar o desempenho dos modelos de regressão foram:

- **MAE** (Erro Absoluto Médio): calcula a média dos erros absolutos entre os valores observados (reais) e as predições (hipóteses). Quanto maior seu valor, pior o desempenho do modelo.
- **MSE** (Erro quadrado médio): Mede o erro ao quadrado médio das previsões do modelo. MSE calcula a diferença ao quadrado entre o resultado observado (real) e os valores previstos e depois calcula a média. Quanto maior seu valor, pior o desempenho do modelo.
- **RMSE** (Raiz do Erro Quadrático Médio): Representa a raiz quadrada do MSE. O RMSE mede a diferença entre os valores previstos pelo modelo e os valores observados (reais). <br> Quanto maior seu valor, pior o desempenho do modelo.
- **R²**: expressa a porcentagem de variância explicada pelas variáveis independentes apresentadas no modelo. Quanto maior, melhor o desempenho do modelo. <br> 

In [None]:
# Nested cross-validation: for every candidate model an inner randomized
# search (tuned on MAE) is wrapped in an outer cross_validate that reports
# several metrics per fold.
n_splits_cv = 5      # outer folds (evaluation)
n_splits_cv_gs = 5   # inner folds (hyper-parameter search)
reported_metrics = [
    'neg_mean_absolute_error',
    'neg_mean_squared_error',
    'neg_root_mean_squared_error',
    'r2'
]
per_model_scores = []
for candidate in models:
    print(f"running {candidate['name']}")

    # Preprocessing options shared by every model, followed by the model's
    # own hyper-parameters addressed through the "model" pipeline step.
    search_space = {
        'preprocessing__quantitative__scaler': [StandardScaler(), MinMaxScaler(), RobustScaler()],
        'preprocessing__quantitative__missing__strategy': ['mean', 'median'],
    }
    for key, value in candidate['parameters'].items():
        search_space[f"model__{key}"] = value

    pipeline = Pipeline(steps=[
        ('preprocessing', preprocessing),
        ('model', candidate['model']),
    ])
    search = RandomizedSearchCV(
        pipeline,
        search_space,
        scoring='neg_mean_absolute_error',
        cv=n_splits_cv_gs,
        random_state=42,
    )
    fold_scores = cross_validate(
        search,
        X,
        y,
        scoring=reported_metrics,
        cv=n_splits_cv,
        n_jobs=-1,
    )
    # Tag every outer-fold row with the model name for the later groupby.
    fold_scores['model'] = [candidate['name'] for _ in range(n_splits_cv)]
    per_model_scores.append(fold_scores)

# One dict of lists covering all models, ready to become a DataFrame.
scores = concatenate(*per_model_scores)

running KNN
running Linear
running XGBoost
running Decision Tree


In [None]:
def highlight_max(s, props=''):
    """
    Build per-cell CSS for one row of the results table.

    The first cell of ``s`` is the score name; the remaining cells are
    strings such as ``"0.123 ± 0.045"`` whose leading number is compared.
    The best cell gets ``props``: the smallest value for rows whose name
    ends with ``'time'``, the largest value otherwise. All other cells
    (including the name cell) get an empty string.
    """
    label = s.values[0]
    means = [float(cell.split()[0]) for cell in s.values[1:]]
    styles = [''] * len(s.values)
    pick_best = np.argmin if label.endswith('time') else np.argmax
    # +1 skips the leading score-name cell.
    styles[pick_best(means) + 1] = props
    return styles

def get_winner(s):
    """
    Return the label of the best model for one row of the results table.

    ``s`` is a row whose first cell is the score name and whose remaining
    cells are strings such as ``"0.123 ± 0.045"``; the row's index labels
    (after the first) are the model names. For rows whose name ends with
    ``'time'`` the smallest leading number wins, otherwise the largest.

    The model names are read from ``s.index`` instead of the global
    ``results`` frame, so the function works on any row independently of
    notebook state. When used via ``DataFrame.apply(axis=1)`` the row index
    equals the frame's columns, so the result is unchanged.
    """
    candidates = s.index[1:]
    values = [float(value.split()[0]) for value in s.values[1:]]

    if s.values[0].endswith('time'):
        return candidates[np.argmin(values)]
    return candidates[np.argmax(values)]

# Collapse the per-fold scores into one "mean ± std" string per
# (metric, model) pair.
summary = (
    pd.DataFrame(scores)
    .groupby(['model'])
    .agg([lambda x: f"{np.mean(x):.3f} ± {np.std(x):.3f}"])
)
# Metrics become rows and models become columns; the metric name ends up in
# a regular column called "score" and the aggregation-name level is dropped.
results = summary.transpose().reset_index()
results = results.rename(columns={"level_0": "score"}).drop(columns="level_1")

# The winner is the model that is best on the largest number of quality
# metrics; timing rows are excluded from the vote.
time_scores = ['fit_time', 'score_time']
quality_rows = results.query('score not in @time_scores')
winner = quality_rows.apply(get_winner, axis=1).value_counts().index[0]

# Blank out the columns-axis label so it is not rendered, then highlight the
# best cell of every row.
results.columns.name = ''
results = results.style.apply(
    highlight_max,
    props='color:white;background-color:gray',
    axis=1,
)
display(results)
print(f'O melhor modelo é o {winner}')

Unnamed: 0,score,Decision Tree,KNN,Linear,XGBoost
0,fit_time,33.659 ± 11.356,4622.694 ± 657.708,20.124 ± 6.285,2668.739 ± 460.713
1,score_time,0.031 ± 0.005,14.973 ± 2.307,0.031 ± 0.005,0.326 ± 0.059
2,test_neg_mean_absolute_error,-35.269 ± 3.443,-35.430 ± 2.635,-35.622 ± 2.492,-31.078 ± 3.409
3,test_neg_mean_squared_error,-3439.702 ± 791.756,-4105.857 ± 548.465,-3004.914 ± 495.726,-2673.011 ± 528.356
4,test_neg_root_mean_squared_error,-58.277 ± 6.597,-63.921 ± 4.472,-54.628 ± 4.545,-51.445 ± 5.145
5,test_r2,0.557 ± 0.039,0.458 ± 0.083,0.606 ± 0.056,0.649 ± 0.072


O melhor modelo é o XGBoost


In [None]:
# Retrieve the spec (name, estimator, search values) of the winning model.
best_model = next(spec for spec in models if spec["name"] == winner)

# Same search space as in the comparison step: preprocessing options plus
# the winner's own hyper-parameters under the "model" pipeline step.
model_space = {
    f"model__{key}": value
    for key, value in best_model['parameters'].items()
}
param_grid = {
    'preprocessing__quantitative__scaler': [StandardScaler(), MinMaxScaler(), RobustScaler()],
    'preprocessing__quantitative__missing__strategy': ['mean', 'median'],
}
param_grid.update(model_space)

approach = Pipeline(steps=[
    ('preprocessing', preprocessing),
    ('model', best_model['model']),
])

gs = RandomizedSearchCV(
    approach,
    param_grid,
    scoring='neg_mean_absolute_error',
    cv=n_splits_cv_gs,
    random_state=42,
)

# Tune on the full dataset and keep the best pipeline found by the search.
model = gs.fit(X, y).best_estimator_
#joblib.dump(model, '../models/best_model.joblib')



In [None]:
# Persist the tuned pipeline to Google Drive. joblib.dump returns the list of
# file paths it wrote, which the notebook shows as the cell output.
joblib.dump(model, '/content/drive/MyDrive/Pyoneers/best_model.joblib')

['/content/drive/MyDrive/Pyoneers/best_model.joblib']

### Considerações Finais
- Os melhores resultados foram obtidos considerando os índices EMI e nino3 (índices oceânicos relacionados ao El Niño) e o Atl3 (índice oceânico associado à variabilidade do Atlântico Tropical) dos 4 meses anteriores.
- Dentre os modelos utilizados, o melhor resultado foi obtido pelo modelo XGBoost. Na validação cruzada apresentada acima, o RMSE foi de aproximadamente 51 mm e as variáveis preditoras explicaram cerca de 65% (R² ≈ 0,65) da variabilidade da precipitação 2 meses no futuro.