In [34]:
import pandas as pd
import numpy as np
import random
import tensorflow as tf

from sklearn.preprocessing import StandardScaler

from sklearn.ensemble import (
    RandomForestRegressor,
    ExtraTreesRegressor,
    GradientBoostingRegressor
)
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsRegressor

from sklearn.metrics import mean_squared_error, r2_score

from tensorflow.keras import Input
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.models import Sequential
from tensorflow.keras.backend import clear_session
from tensorflow.keras.layers import Conv1D, MaxPooling1D, Flatten, Dense, Dropout

In [None]:
def predicao_por_ml(df: pd.DataFrame,
                    col_de_treino: list[str],
                    var_de_predicao: str,
                    anos_X: int):

    # Definição dos modelos
    models = [

        ("ExtraTrees", ExtraTreesRegressor(
            n_estimators=15,
            max_depth=20,
            max_features=2,
            min_samples_split=2,
            min_samples_leaf=1,
            random_state=7
        )),

        ("RandomForest", RandomForestRegressor(
            n_estimators=15,
            max_depth=25,
            max_features=2,
            min_samples_split=2,
            min_samples_leaf=1,
            random_state=7
        )),

        ("GradientBoosting", GradientBoostingRegressor(
            n_estimators=200,
            learning_rate=0.05,
            max_depth=5,
            subsample=0.8,
            random_state=7
        )),

        ("KNeighbors", KNeighborsRegressor(
            n_neighbors=7,
            weights='distance',
            algorithm='auto'
        )),

        ("LinearRegression", LinearRegression(
            fit_intercept=True,
            positive=False
        ))

    ]

    best_model = ['', 0]

    for model in models:

        # Lista de métricas por modelo
        r2_list, rmse_list = [], []

        for ano_X in anos_X:

            df_treino = df[df['ano'] <= ano_X].copy()
            df_teste  = df[df['ano']  > ano_X].copy()

            X_train = df_treino[col_de_treino]
            y_train = df_treino[var_de_predicao]

            X_test = df_teste[col_de_treino]
            y_test = df_teste[var_de_predicao]

            model[1].fit(X_train, y_train)
            y_pred = model[1].predict(X_test)

            r2_list.append(r2_score(y_test, y_pred))  # R²
            rmse_list.append(np.sqrt(mean_squared_error(y_test, y_pred)))  # RMSE

        if np.mean(r2_list) >= best_model[1]:
            best_model[0] = model[1]
            best_model[1] = np.mean(r2_list)

        print(f"{model[0][:3]} \t Média R²: {np.mean(r2_list):.4f} \t Média RMSE: {np.mean(rmse_list):.4f}")

    return best_model[0]

In [31]:
# Abrindo base de dados para predição
df_aesa_to_cnrm_cm6_1hr = pd.read_csv(f'../datas/interim/4.3.3_finish_downscaling_database/aesa_to_cnrm_cm6_1hr_sum_downscaling_complete.csv')

# Colunas X e y
X_col, y_col = ['pr', 'pr_acum_6m', 'pr_mes_anterior', 'cluster', 'ano', 'mes', 'lat', 'lon'], "pr_local"

# Definindo ano que separará o treino e a predição
anos_X = [2016, 2017, 2018, 2019, 2020, 2021, 2022]

# Supondo que sua função de predição já esteja definida:
model = predicao_por_ml(df_aesa_to_cnrm_cm6_1hr[df_aesa_to_cnrm_cm6_1hr['ano'] <= 2023], X_col, y_col, anos_X)

model

Ext 	 Média R²: 0.5878 	 Média RMSE: 56.6285
Ran 	 Média R²: 0.5721 	 Média RMSE: 57.6727
Gra 	 Média R²: 0.5483 	 Média RMSE: 59.0816
KNe 	 Média R²: 0.4443 	 Média RMSE: 65.4520
Lin 	 Média R²: 0.3621 	 Média RMSE: 70.4603


In [None]:
def predicao_por_cnn(df: pd.DataFrame,
                     col_de_treino: list[str],
                     var_de_predicao: str,
                     anos_X: int,
                     seed=58):

    # Ficando seed
    random.seed(seed)
    np.random.seed(seed)
    tf.random.set_seed(seed)

    r2_list, rmse_list = [], []

    for ano_X in anos_X:

        # Separar dados em treino e teste
        df_treino = df[df['ano'] <= ano_X].copy()
        df_teste  = df[df['ano']  > ano_X].copy()

        X_train = df_treino[col_de_treino].values
        y_train = df_treino[var_de_predicao].values

        X_test = df_teste[col_de_treino].values
        y_test = df_teste[var_de_predicao].values

        # Normalização
        scaler = StandardScaler()
        X_train = scaler.fit_transform(X_train)
        X_test = scaler.transform(X_test)

        # Redimensionar para 3D: (samples, timesteps=1, features)
        X_train = X_train.reshape((X_train.shape[0], 1, X_train.shape[1]))
        X_test = X_test.reshape((X_test.shape[0], 1, X_test.shape[1]))

        # Limpar sessão anterior (importante em loops com Keras)Dropout
        clear_session()

        # Criando modelo CNN
        cnn = Sequential([
            Input(shape=(X_train.shape[1], X_train.shape[2])),
            Conv1D(64, kernel_size=3, padding='same', activation='relu'),
            MaxPooling1D(1),
            Flatten(),
            Dense(128, activation='relu'),
            Dense(64, activation='relu'),
            Dense(1)
        ])

        cnn.compile(optimizer=Adam(learning_rate=0.001), loss='mse', metrics=['mse'])

        # Treinamento
        cnn.fit(X_train, y_train, epochs=5, validation_split=0.2, verbose=0)

        # Previsão e avaliação
        y_pred = cnn.predict(X_test).flatten()

        r2_list.append(r2_score(y_test, y_pred))
        rmse_list.append(np.sqrt(mean_squared_error(y_test, y_pred)))

    print(f"\nCNN \t Média R²: {np.mean(r2_list):.4f} \t Média RMSE: {np.mean(rmse_list):.4f} \t SEED: {seed}")

    return cnn

In [33]:
# Abrindo base de dados para predição
df_aesa_to_cnrm_cm6_1hr = pd.read_csv(f'../datas/interim/4.3.3_finish_downscaling_database/aesa_to_cnrm_cm6_1hr_sum_downscaling_complete.csv')

# Colunas X e y
X_col, y_col = ['pr', 'pr_acum_6m', 'pr_mes_anterior', 'cluster', 'ano', 'mes', 'lat', 'lon'], "pr_local"

# Definindo ano que separará o treino e a predição
anos_X = [2016, 2017, 2018, 2019, 2020, 2021, 2022]

model = predicao_por_cnn(df_aesa_to_cnrm_cm6_1hr[df_aesa_to_cnrm_cm6_1hr['ano'] <= 2023], X_col, y_col, anos_X, 58)

13800 1 1
[1m132/132[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1000us/step
14400 1 1
[1m113/113[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step  
15000 1 1
[1m94/94[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step  
15600 1 1
[1m75/75[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 829us/step
16200 1 1
[1m57/57[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step
16800 1 1
[1m38/38[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step
17400 1 1
[1m19/19[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step 

CNN 	 Média R²: 0.5820 	 Média RMSE: 57.0266 	 SEED: 58


In [38]:
def predicao_por_mlp(df: pd.DataFrame,
                     col_de_treino: list[str],
                     var_de_predicao: str,
                     anos_X: int,
                     seed=58):

    # Ficando seed
    random.seed(seed)
    np.random.seed(seed)
    tf.random.set_seed(seed)

    r2_list, rmse_list = [], []

    for ano_X in anos_X:

        # Separar dados em treino e teste
        df_treino = df[df['ano'] <= ano_X].copy()
        df_teste  = df[df['ano']  > ano_X].copy()

        X_train = df_treino[col_de_treino].values
        y_train = df_treino[var_de_predicao].values

        X_test = df_teste[col_de_treino].values
        y_test = df_teste[var_de_predicao].values

        # Normalização
        scaler = StandardScaler()
        X_train = scaler.fit_transform(X_train)
        X_test = scaler.transform(X_test)

        # Limpar sessão anterior (importante em loops com Keras)Dropout
        clear_session()

        # Criando modelo CNN
        mlp = Sequential([
            Input(shape=(X_train.shape[1],)),
            Dense(128, activation='relu'),
            Dropout(0.3),
            Dense(64, activation='relu'),
            Dropout(0.2),
            Dense(32, activation='relu'),
            Dense(1)
        ])

        mlp.compile(optimizer=Adam(learning_rate=0.001), loss='mse', metrics=['mse'])

        # Treinamento
        mlp.fit(X_train, y_train, epochs=5, validation_split=0.2, verbose=0)

        # Previsão e avaliação
        y_pred = mlp.predict(X_test).flatten()

        r2_list.append(r2_score(y_test, y_pred))
        rmse_list.append(np.sqrt(mean_squared_error(y_test, y_pred)))

    print(f"\nMLP \t Média R²: {np.mean(r2_list):.4f} \t Média RMSE: {np.mean(rmse_list):.4f} \t SEED: {seed}")

    return mlp

In [39]:
# Abrindo base de dados para predição
df_aesa_to_cnrm_cm6_1hr = pd.read_csv(f'../datas/interim/4.3.3_finish_downscaling_database/aesa_to_cnrm_cm6_1hr_sum_downscaling_complete.csv')

# Colunas X e y
X_col, y_col = ['pr', 'pr_acum_6m', 'pr_mes_anterior', 'cluster', 'ano', 'mes', 'lat', 'lon'], "pr_local"

# Definindo ano que separará o treino e a predição
anos_X = [2016, 2017, 2018, 2019, 2020, 2021, 2022]

model = predicao_por_mlp(df_aesa_to_cnrm_cm6_1hr[df_aesa_to_cnrm_cm6_1hr['ano'] <= 2023], X_col, y_col, anos_X, 58)

[1m132/132[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step  
[1m113/113[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step
[1m94/94[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step  
[1m75/75[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 843us/step
[1m57/57[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step
[1m38/38[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step
[1m19/19[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step 

MLP 	 Média R²: 0.5400 	 Média RMSE: 59.8265 	 SEED: 58
