In [1]:
import warnings
warnings.filterwarnings('ignore')

In [2]:
import torch
print(torch.cuda.is_available())
print(torch.cuda.get_device_name(0))
x = torch.randn(2, 2).cuda()
print(x)


False


RuntimeError: CUDA unknown error - this may be due to an incorrectly set up environment, e.g. changing env variable CUDA_VISIBLE_DEVICES after program start. Setting the available devices to be zero.

In [3]:
# Opcionalmente, podemos mostrar informações da GPU
# print("get_ipython().system('nvidia-smi') executado para demonstrar a GPU em uso.")
get_ipython().system('nvidia-smi')

Thu Apr 24 02:55:27 2025       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 570.86.15              Driver Version: 570.86.15      CUDA Version: 12.8     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  NVIDIA GeForce RTX 3090        Off |   00000000:00:09.0 Off |                  N/A |
| 37%   35C    P8             30W /  370W |      15MiB /  24576MiB |      0%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
                                                

In [3]:
import torch
from torch import nn
import lightning.pytorch as L  # Novo estilo unificado
from lightning.pytorch import Trainer
from lightning.pytorch.callbacks import ModelCheckpoint
from lightning.pytorch.loggers import CSVLogger

# Plotagem
import matplotlib.pyplot as plt
import seaborn as sns

# Geral
import numpy as np
import pandas as pd
import gc
import os
import logging

from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score, KFold

logging.basicConfig(level=logging.ERROR)
logger = logging.getLogger(__name__)
logging.getLogger("lightning.pytorch.callbacks.model_checkpoint").setLevel(logging.ERROR)

In [4]:
# Ajusta a precisão de multiplicações de matriz do PyTorch
torch.set_float32_matmul_precision('medium')

In [5]:
import json

def json_to_dataframe(file_path):
    with open(file_path, 'r', encoding='utf-8') as f:
        dados_json = json.load(f)
    
    features = list(dados_json.values())
    df = pd.DataFrame(features, columns=["feature"])
    return df

In [6]:
# Função para selecionar as top features (usando top_percentage)
def select_top_features(df_importance: pd.DataFrame, top_percentage=0.7):
    df_sorted = df_importance.sort_values(by='importance', ascending=False)
    top_n = int(len(df_sorted) * top_percentage)
    return df_sorted.head(top_n)['feature'].tolist()

def select_top_features_word2vec(df_importance: pd.DataFrame, top_percentage=0.7):
    top_n = int(len(df_importance) * top_percentage)
    return df_importance.head(top_n)['feature'].tolist()

# Carrega CSVs com importâncias
decision_tree = pd.read_csv('feature_importances_DecisionTree.csv')
random_forest = pd.read_csv('feature_importances_RandomForest.csv')
gradient_boosting = pd.read_csv('feature_importances_GradientBoosting.csv')
xgboost = pd.read_csv('feature_importances_XGBoost.csv')
word2vec = json_to_dataframe('data_features_w2v.json')

# Dicionário contendo as features mais importantes de cada algoritmo
top_features_dict = {
    'DecisionTree': select_top_features(decision_tree, top_percentage=0.7),
    'RandomForest': select_top_features(random_forest, top_percentage=0.7),
    'GradientBoosting': select_top_features(gradient_boosting, top_percentage=0.7),
    'XGBoost': select_top_features(xgboost, top_percentage=0.7),
    'Word2Vec': select_top_features_word2vec(word2vec, top_percentage=0.7)
}

# Também definimos:
parquet_file = "data (2).parquet"
target_column = "perf"

In [7]:
# Verifica se há GPU disponível
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)

cuda


In [8]:
# Dataset customizado (igual ao primeiro notebook)
class CustomDataset(torch.utils.data.Dataset):
    def __init__(self, features, labels):
        self.features = features
        self.labels = labels
    
    def __len__(self):
        return self.features.shape[0]
    
    def __getitem__(self, idx):
        return self.features[idx], self.labels[idx]

In [9]:
class LightningModel(L.LightningModule):
    def __init__(self, num_features, activation="ReLU", optimizer_name="Adam", loss_name="MSE", dropout_rate=0.3):
        super().__init__()
        self.num_features = num_features
        self.activation = activation
        self.optimizer_name = optimizer_name
        self.loss_name = loss_name
        self.dropout_rate = dropout_rate
        self.model = self.build_model()

    def forward(self, x):
        return self.model(x)

    def training_step(self, batch, batch_idx):
        x, y = batch
        z = self(x)
        loss = self.get_loss_function()(z, y.unsqueeze(1))
        self.log("train_loss", loss, on_step=False, on_epoch=True)
        return loss

    def validation_step(self, batch, batch_idx):
        x, y = batch
        z = self(x)
        loss = self.get_loss_function()(z, y.unsqueeze(1))
        self.log("val_loss", loss, on_step=False, on_epoch=True)
        return loss

    def configure_optimizers(self):
        return self.get_optimizer()

    def get_optimizer(self):
        optimizers = {
            "Adam": torch.optim.Adam(self.parameters(), lr=0.001),
            "AdamW": torch.optim.AdamW(self.parameters(), lr=0.001),
        }
        return optimizers[self.optimizer_name]

    def get_loss_function(self):
        loss_functions = {
            "MSE": nn.MSELoss(),
            "SmoothL1Loss": nn.SmoothL1Loss(),
            "MAE": nn.L1Loss()
        }
        return loss_functions[self.loss_name]

    def get_activation(self):
        activations = {
            "ReLU": nn.ReLU(),
            "PReLU": nn.PReLU(),
            "ELU": nn.ELU(),
            "LeakyReLU": nn.LeakyReLU()
        }
        return activations[self.activation]

    def build_model(self):
        hidden_size = self.num_features // 2
        hidden_size2 = hidden_size // 2
        return nn.Sequential(
            nn.Linear(self.num_features, hidden_size),
            self.get_activation(),
            nn.Dropout(self.dropout_rate),
            nn.Linear(hidden_size, hidden_size2),
            self.get_activation(),
            nn.Dropout(self.dropout_rate),
            nn.Linear(hidden_size2, 1)
        )


In [10]:
# Callback de Early Stopping customizado (do primeiro notebook)
import time
import copy
import numpy as np
from lightning.pytorch.callbacks import Callback

class CustomEarlyStopping(Callback):
    def __init__(
        self,
        monitor="val_loss",
        patience_base=10,
        min_delta_factor=0.1,
        max_patience=20,
        window=5,
        mode="min",
    ):
        super().__init__()
        self.monitor = monitor
        self.patience_base = patience_base
        self.min_delta_factor = min_delta_factor
        self.max_patience = max_patience
        self.window = window
        self.mode = mode

        self.best_score = float("inf") if mode == "min" else float("-inf")
        self.best_state_dict = None
        self.wait = 0
        self.loss_history = []

        self.start_time = None
        self.total_time = 0

    def on_train_start(self, trainer, pl_module):
        self.start_time = time.time()

    def on_train_epoch_end(self, trainer, pl_module):
        if self.start_time is not None:
            self.total_time = time.time() - self.start_time
            trainer.logger.log_metrics({"train_time": int(self.total_time)}, step=trainer.current_epoch)

    def on_validation_end(self, trainer, pl_module):
        current_score = trainer.callback_metrics.get(self.monitor)
        if current_score is None:
            return
        current_score = float(current_score)
        self.loss_history.append(current_score)

        if len(self.loss_history) > self.window:
            self.loss_history.pop(0)

        if len(self.loss_history) >= self.window:
            std_dev = np.std(self.loss_history)
            min_delta = self.min_delta_factor * std_dev
            patience_dynamic = (min(self.patience_base + int(std_dev / min_delta), self.max_patience)
                if min_delta != 0 else self.patience_base
            )
        else:
            min_delta = 1e-3
            patience_dynamic = self.patience_base

        improved = False
        if self.mode == "min":
            if (self.best_score - current_score) > min_delta:
                improved = True
        else:
            if (current_score - self.best_score) > min_delta:
                improved = True

        if improved:
            self.best_score = current_score
            self.best_state_dict = copy.deepcopy(pl_module.state_dict())
            self.wait = 0
        else:
            self.wait += 1
            if self.wait >= patience_dynamic:
                print(f"[CustomEarlyStopping] Parando no epoch={trainer.current_epoch}. Tempo total: {int(self.total_time)} seg.")
                if self.best_state_dict is not None:
                    pl_module.load_state_dict(self.best_state_dict)
                trainer.should_stop = True

In [11]:
# Função para treinar um modelo dado uma combinação de hiperparâmetros (sem Ray, igual ao primeiro notebook)
def train_model_tune(config, num_features, train_dataset, test_dataset, batch_size=256, max_epochs=50, num_workers=8, logs_dir="logs"):
    config_id = f"{config['optimizer']}_{config['loss_function']}_{config['activation']}"
    csv_logger = CSVLogger(save_dir=logs_dir, name=config_id)

    train_dataloader = torch.utils.data.DataLoader(
        train_dataset,
        batch_size=batch_size,
        shuffle=True,
        num_workers=num_workers
    )
    val_dataloader = torch.utils.data.DataLoader(
        test_dataset,
        batch_size=batch_size,
        shuffle=False,
        num_workers=num_workers
    )

    model = LightningModel(
        num_features=num_features,
        activation=config["activation"],
        optimizer_name=config["optimizer"],
        loss_name=config["loss_function"],
        dropout_rate=config["dropout_rate"]
    )


    early_stop_callback = CustomEarlyStopping(
        monitor="val_loss",
        patience_base=10,
        min_delta_factor=0.4,
        max_patience=20,
        window=5,
        mode="min"
    )

    checkpoint_callback = ModelCheckpoint(
        dirpath=os.path.join(logs_dir, "checkpoints_" + config_id),
        filename="epoch-{epoch:02d}-val_loss-{val_loss:.2f}",
        save_top_k=3,
        verbose=False,
        save_weights_only=True,
        every_n_epochs=1,
        monitor="val_loss",
        mode="min"
    )

    trainer = L.Trainer(
        max_epochs=max_epochs,
        accelerator="auto",
        devices=1,
        callbacks=[early_stop_callback, checkpoint_callback],
        logger=csv_logger,
        enable_progress_bar=True
    )

    trainer.fit(model, train_dataloader, val_dataloader)

In [12]:
import itertools
import os

def run_grid_search(num_features, train_dataset, test_dataset, param_grid,
                    algo_name="FeatureSelection", batch_size=256, max_epochs=50, num_workers=8):
    results = {}
    keys = list(param_grid.keys())
    values = [param_grid[k] for k in keys]

    for config_values in itertools.product(*values):
        config = dict(zip(keys, config_values))

        # Extrai e formata o dropout para compor o nome da pasta
        drop = str(config["dropout_rate"]).replace(".", "")
        logs_dir = f"logs_{algo_name}_{drop}"

        # Garante que a pasta exista
        os.makedirs(logs_dir, exist_ok=True)

        print("\nTreinando com a configuração:", config, "| Algoritmo:", algo_name)

        train_model_tune(
            config=config,
            num_features=num_features,
            train_dataset=train_dataset,
            test_dataset=test_dataset,
            batch_size=batch_size,
            max_epochs=max_epochs,
            num_workers=num_workers,
            logs_dir=logs_dir
        )
        results[str(config)] = "Métricas salvas no CSVLogger em " + logs_dir

    return results


In [13]:
####################################################################################
# RODAMOS A VARIAÇÃO DE PORCENTAGENS DE FEATURES SELECIONADAS PARA CADA ALGORITMO
####################################################################################

param_grid = {
    "optimizer": ["Adam", "AdamW"],
    "loss_function": ["MAE", "MSE", "SmoothL1Loss"],
    "activation": ["ELU", "PReLU", "ReLU", "LeakyReLU"],
    "dropout_rate": [0.3]
}

# Percentuais de features que serão testados
feature_percentages = [0.5]

# Carrega CSVs de importância apenas uma vez
importance_data = {
    # 'DecisionTree': pd.read_csv('feature_importances_DecisionTree.csv'),
    # 'RandomForest': pd.read_csv('feature_importances_RandomForest.csv'),
    # 'GradientBoosting': pd.read_csv('feature_importances_GradientBoosting.csv'),
    'XGBoost': pd.read_csv('feature_importances_XGBoost.csv'),
    # 'Word2Vec': json_to_dataframe('data_features_w2v.json')
}

for pct in feature_percentages:
    pct_label = f"{int(pct * 100)}pct"
    
    for algo_name, df_importance in importance_data.items():
        print(f"=== Iniciando experimento para {algo_name} com {pct_label} das features ===")
        
        # Aplica a função para selecionar features com base no percentual
        if algo_name == 'Word2Vec':
            selected_features = select_top_features_word2vec(df_importance, top_percentage=pct)
        else:
            selected_features = select_top_features(df_importance, top_percentage=pct)
        columns_needed = selected_features + [target_column]

        # Carrega os dados com as colunas necessárias
        df = pd.read_parquet(parquet_file, columns=columns_needed)
        df = df.astype('float32')

        X = df.drop(columns=[target_column])
        y = df[target_column]
        del df
        gc.collect()
        
        # Divide em treino e teste
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
        del X, y
        gc.collect()
        
        # Converte para tensores
        X_train_tensor = torch.tensor(X_train.values, dtype=torch.float32)
        y_train_tensor = torch.tensor(y_train.values, dtype=torch.float32)
        X_test_tensor = torch.tensor(X_test.values, dtype=torch.float32)
        y_test_tensor = torch.tensor(y_test.values, dtype=torch.float32)
        del X_train, X_test, y_train, y_test
        gc.collect()

        train_dataset = CustomDataset(X_train_tensor, y_train_tensor)
        test_dataset  = CustomDataset(X_test_tensor, y_test_tensor)

        num_features = train_dataset.features.shape[1]

        # Roda o grid search para esse algoritmo + percentual
        results = run_grid_search(
            num_features=num_features,
            train_dataset=train_dataset,
            test_dataset=test_dataset,
            param_grid=param_grid,
            algo_name=f"{algo_name}_{pct_label}",
            batch_size=1024,
            max_epochs=300,
            num_workers=8
        )

        print(f"Resultados do grid search para {algo_name} ({pct_label} features):", results)


=== Iniciando experimento para XGBoost com 50pct das features ===


GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs



Treinando com a configuração: {'optimizer': 'Adam', 'loss_function': 'MAE', 'activation': 'ELU', 'dropout_rate': 0.3} | Algoritmo: XGBoost_50pct


RuntimeError: CUDA unknown error - this may be due to an incorrectly set up environment, e.g. changing env variable CUDA_VISIBLE_DEVICES after program start. Setting the available devices to be zero.

In [None]:
# --- CÉLULA FINAL: Carregar um modelo treinado a partir de um checkpoint e plotar os resultados ---

# Atualiza a função plot_results para mover os dados para o mesmo dispositivo do modelo
def plot_results(model, dataset, batch_size=1024):
    model.eval()
    dataloader = torch.utils.data.DataLoader(dataset, batch_size=batch_size, shuffle=False)
    
    all_preds = []
    all_true = []
    
    with torch.no_grad():
        for x, y in dataloader:
            # Move x para o mesmo dispositivo do modelo
            x = x.to(next(model.parameters()).device)
            preds = model(x).squeeze()
            all_preds.extend(preds.cpu().numpy())
            all_true.extend(y.cpu().numpy())
    
    all_preds = np.array(all_preds)
    all_true = np.array(all_true)
    
    plt.figure(figsize=(8, 6))
    plt.scatter(all_true, all_preds, alpha=0.5)
    plt.xlabel("Valores Reais")
    plt.ylabel("Predições")
    plt.title("Comparação: Predições vs Valores Reais")
    
    min_val = min(all_true.min(), all_preds.min())
    max_val = max(all_true.max(), all_preds.max())
    plt.plot([min_val, max_val], [min_val, max_val], 'r--')
    plt.show()

# Supondo que o test_dataset já esteja definido e preparado:
num_features = test_dataset.features.shape[1]

# Defina o caminho para o checkpoint do modelo treinado
best_model_path = "logs_DecisionTree/checkpoints_Adam_MAE_ELU/epoch-epoch=299-val_loss-val_loss=8850943.00.ckpt"

# Carrega o modelo treinado a partir do checkpoint, passando os mesmos parâmetros usados no treinamento
loaded_model = LightningModel.load_from_checkpoint(
    best_model_path,
    num_features=num_features,
    activation="ELU",
    optimizer_name="Adam",
    loss_name="MAE"
)

# Move o modelo para o dispositivo (GPU ou CPU) conforme definido na variável 'device'
loaded_model = loaded_model.to(device)
loaded_model.eval()

# Plota os resultados utilizando o test_dataset
plot_results(loaded_model, test_dataset)


In [None]:
# Gera as predições usando o modelo carregado, movendo o tensor de entrada para o mesmo dispositivo do modelo
pred = loaded_model(X_test_tensor.to(device)).cpu().detach().numpy()

# Converte os valores reais para numpy (usando apenas os primeiros 100 elementos)
y_true = y_test_tensor.cpu().detach().numpy()[:100]

# Plota os valores reais e as predições
plt.figure(figsize=(10,6))
plt.plot(y_true, color='b', label='Valores Reais')
plt.plot(pred[:100], color='r', linestyle='dashed', label='Predições')
plt.title("Comparação: Valores Reais vs Predições (primeiros 100)")
plt.xlabel("Índice")
plt.ylabel("Valor")
plt.legend()
plt.show()
