In [1]:
import os
import re
import json
import torch
from torch import nn
import lightning.pytorch as L  # Novo estilo unificado
from lightning.pytorch import Trainer
from lightning.pytorch.callbacks import ModelCheckpoint
from lightning.pytorch.loggers import CSVLogger
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import pandas as pd
import gc
import os
import logging
import re
from sklearn.model_selection import train_test_split, cross_val_score, KFold
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")


In [2]:
base_path = "/home/jupyter-jphuser2"  # ajuste aqui
output_plot_dir = os.path.join(base_path, "plots_full")
os.makedirs(output_plot_dir, exist_ok=True)
best_checkpoints = []


In [3]:
# Coleta melhores checkpoints por configuração
for folder in os.listdir(base_path):
    if folder.startswith("logs_"):
        feature_selection = folder.replace("logs_", "")
        folder_path = os.path.join(base_path, folder)
        for config in os.listdir(folder_path):
            if config.startswith("checkpoints_") and os.path.isdir(os.path.join(folder_path, config)):
                best_loss = float('inf')
                best_ckpt = None
                for file in os.listdir(os.path.join(folder_path, config)):
                    match = re.search(r"val_loss=(\d+(?:\.\d+)?)", file)
                    if match:
                        val_loss = float(match.group(1))
                        if val_loss < best_loss:
                            best_loss = val_loss
                            best_ckpt = os.path.join(folder_path, config, file)
                if best_ckpt:
                    parts = config.replace("checkpoints_", "").split("_")
                    best_checkpoints.append({
                        'feature_selection': feature_selection,
                        'optimizer': parts[0],
                        'loss': parts[1],
                        'activation': '_'.join(parts[2:]),
                        'val_loss': best_loss,
                        'checkpoint_path': best_ckpt
                    })


NotADirectoryError: [Errno 20] Not a directory: '/home/jupyter-jphuser2/logs_and_plots.zip'

In [4]:
# Definição do modelo Lightning (execute antes de gerar plots)
import torch
from torch import nn
import lightning.pytorch as L

class LightningModel(L.LightningModule):
    def __init__(self, num_features, activation="ReLU", optimizer_name="Adam", loss_name="MSE", dropout_rate=0.3):
        super().__init__()
        self.num_features = num_features
        self.activation = activation
        self.optimizer_name = optimizer_name
        self.loss_name = loss_name
        self.dropout_rate = dropout_rate
        self.model = self.build_model()

    def forward(self, x):
        return self.model(x)

    def training_step(self, batch, batch_idx):
        x, y = batch
        z = self(x)
        loss = self.get_loss_function()(z, y.unsqueeze(1))
        self.log("train_loss", loss, on_step=False, on_epoch=True)
        return loss

    def validation_step(self, batch, batch_idx):
        x, y = batch
        z = self(x)
        loss = self.get_loss_function()(z, y.unsqueeze(1))
        self.log("val_loss", loss, on_step=False, on_epoch=True)
        return loss

    def configure_optimizers(self):
        return self.get_optimizer()

    def get_optimizer(self):
        optimizers = {
            "Adam": torch.optim.Adam(self.parameters(), lr=0.001),
            "AdamW": torch.optim.AdamW(self.parameters(), lr=0.001),
        }
        return optimizers[self.optimizer_name]

    def get_loss_function(self):
        loss_functions = {
            "MSE": nn.MSELoss(),
            "SmoothL1Loss": nn.SmoothL1Loss(),
            "MAE": nn.L1Loss()
        }
        return loss_functions[self.loss_name]

    def get_activation(self):
        activations = {
            "ReLU": nn.ReLU(),
            "PReLU": nn.PReLU(),
            "ELU": nn.ELU(),
            "LeakyReLU": nn.LeakyReLU()
        }
        return activations[self.activation]

    def build_model(self):
        hidden_size = self.num_features // 2
        hidden_size2 = hidden_size // 2
        return nn.Sequential(
            nn.Linear(self.num_features, hidden_size),
            self.get_activation(),
            nn.Dropout(self.dropout_rate),
            nn.Linear(hidden_size, hidden_size2),
            self.get_activation(),
            nn.Dropout(self.dropout_rate),
            nn.Linear(hidden_size2, 1)
        )


In [5]:
# %% [markdown]
# # Cálculo de MAPE para cada modelo

# %% [code]
import os
import gc
import json
import numpy as np
import pandas as pd
import torch
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split

# Configurações gerais
parquet_file    = 'data (2).parquet'
target_column   = 'perf'
output_mape_csv = 'mape_summary.csv'
output_plot_dir = 'plots'
device          = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# %% [code]
# Funções auxiliares
def json_to_dataframe(path):
    with open(path, 'r', encoding='utf-8') as f:
        dados = json.load(f)
    return pd.DataFrame(list(dados.values()), columns=['feature'])

def select_top_features(df_imp, pct):
    df_sorted = df_imp.sort_values(by='importance', ascending=False)
    top_n = int(len(df_sorted) * pct)
    return df_sorted['feature'].iloc[:top_n].tolist()

# Carrega apenas uma vez todos os dataframes de importâncias
importance_data = {
    'DecisionTree':     pd.read_csv('feature_importances_DecisionTree.csv'),
    'RandomForest':     pd.read_csv('feature_importances_RandomForest.csv'),
    'GradientBoosting': pd.read_csv('feature_importances_GradientBoosting.csv'),
    'XGBoost':          pd.read_csv('feature_importances_XGBoost.csv'),
    'Word2Vec':         json_to_dataframe('data_features_w2v.json')
}

# %% [code]
# Loop principal: gera predições, calcula MAPE e opcionalmente salva um scatter plot
mape_records = []

for ckpt in best_checkpoints:
    # --- Seleção de colunas ---
    fs = ckpt['feature_selection'].split('_')
    algo, pct_label = fs[0], fs[1]
    if algo == 'allFeatures':
        cols = None
    else:
        pct = int(pct_label.replace('pct','')) / 100
        df_imp = importance_data[algo]
        if algo == 'Word2Vec':
            selected = df_imp['feature'].tolist()[:int(len(df_imp)*pct)]
        else:
            selected = select_top_features(df_imp, pct)
        cols = selected + [target_column]

    # --- Carrega apenas colunas necessárias e divide em treino/teste ---
    df = pd.read_parquet(parquet_file, columns=cols)
    X = df.drop(columns=[target_column])
    y = df[target_column]
    del df; gc.collect()

    _, X_test, _, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    del X, y; gc.collect()

    X_test_tensor = torch.tensor(X_test.values, dtype=torch.float32).to(device)
    y_test_tensor = torch.tensor(y_test.values, dtype=torch.float32).to(device)
    del X_test, y_test; gc.collect()

    # --- Carrega o modelo e faz predição ---
    model = LightningModel.load_from_checkpoint(
        ckpt['checkpoint_path'],
        num_features=X_test_tensor.shape[1],
        activation=ckpt['activation'],
        optimizer_name=ckpt['optimizer'],
        loss_name=ckpt['loss']
    ).to(device)
    model.eval()
    with torch.no_grad():
        preds = model(X_test_tensor).cpu().squeeze().numpy()
    y_true = y_test_tensor.cpu().numpy()

    # --- Cálculo do MAPE ---
    eps = 1e-8
    mape = np.mean(np.abs((y_true - preds) / (y_true + eps))) * 100

    # Armazena o resultado
    mape_records.append({
        "Feature_Selection":   ckpt['feature_selection'],
        "Optimizer":           ckpt['optimizer'],
        "Loss_Function":       ckpt['loss'],
        "Activation_Function": ckpt['activation'],
        "MAPE(%)":             mape
    })

    # (Opcional) salva scatter real vs predito
    plt.figure(figsize=(10, 6))
    plt.plot(y_test_tensor.cpu().numpy(), label='Valores Reais')
    plt.plot(preds[:100], linestyle='dashed', label='Predições')
    plt.xlabel('Índice')
    plt.ylabel('Perf')
    plt.title(f"{ckpt['feature_selection']} | {ckpt['optimizer']}_{ckpt['loss']}_{ckpt['activation']}")
    plt.legend()

    # salva e libera
    fname = f"{ckpt['feature_selection']}_{ckpt['optimizer']}_{ckpt['loss']}_{ckpt['activation']}.png"
    plt.savefig(os.path.join(output_plot_dir, fname))
    plt.close()

    del X_test_tensor, y_test_tensor, preds, model
    gc.collect()

# %% [code]
# Salva o summary de MAPE em CSV
mape_df = pd.DataFrame(mape_records)
mape_df.to_csv(output_mape_csv, index=False)
print(f"Resumo de MAPE salvo em: {output_mape_csv}")


RuntimeError: CUDA unknown error - this may be due to an incorrectly set up environment, e.g. changing env variable CUDA_VISIBLE_DEVICES after program start. Setting the available devices to be zero.