In [9]:
import pandas as pd
import numpy as np
from sentence_transformers import SentenceTransformer

import torch
import torch.optim as optim


import numpy as np
import torch
from sklearn.neighbors import NearestNeighbors as SklearnNN
import torch.nn as nn
from sklearn.metrics.pairwise import cosine_similarity


from sklearn.metrics import mean_absolute_error, mean_squared_error, median_absolute_error
from sklearn.metrics import r2_score
from sklearn.model_selection import train_test_split
import json
from sklearn.preprocessing import MinMaxScaler, StandardScaler
import pickle
import random

from torch_geometric.loader import NeighborLoader
from torch_geometric.nn import GATConv
from sklearn.model_selection import train_test_split
from torch_geometric.data import Data

import copy
import time
import statistics
import os

In [2]:
node_features = np.load("../../pre_process/embeddings.npy")
df_sampled = pd.read_pickle("../../pre_process/df_sampled.pkl")

node_features = torch.tensor(node_features, dtype=torch.float32).clone().detach()
print("Amostras no df_sampled:", df_sampled.shape[0])
print("Amostras nos embeddings:", node_features.shape[0])

Amostras no df_sampled: 20757
Amostras nos embeddings: 20757


In [15]:
BERT_MODEL_NAME = "all-MiniLM-L6-v2"
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
PROJECT_NAMES = {
    36: "MULE",
    43: "EVG",
    12: "TIMOB",
    4:  "MESOS"
}
PROJECT_ID = 36 # Projeto de teste MULE(36), EVG(43), TIMOB(12), MESOS(4)
project_name = PROJECT_NAMES.get(PROJECT_ID, f"project_{PROJECT_ID}")
VERSAO_MODELO = f"gat_within_{project_name}"

# Carregar dados
node_features_all = np.load("../../pre_process/embeddings.npy")
df_sampled = pd.read_pickle("../../pre_process/df_sampled.pkl")

# Filtrar apenas o projeto desejado
df_project = df_sampled[df_sampled["Project_ID"] == PROJECT_ID].copy().reset_index(drop=True)
node_features = np.vstack(df_project["BERT_Embedding"])

node_features = torch.tensor(node_features, dtype=torch.float32).clone().detach()

# Criar arestas kNN
k = 4
sknn = SklearnNN(n_neighbors=k, metric="cosine").fit(node_features)
_, indices = sknn.kneighbors(node_features)

edges = []
for i in range(len(df_project)):
    for j in range(1, k):  # Ignora auto-loop
        edges.append([i, indices[i, j]])

edge_index = torch.tensor(edges, dtype=torch.long).T

# Tornar bidirecional
edge_index_bi = torch.cat([edge_index, edge_index.flip(0)], dim=1)

# Remover duplicatas se desejar (opcional, recomendado)
sorted_edge_index = torch.stack([
    torch.min(edge_index_bi[0], edge_index_bi[1]),
    torch.max(edge_index_bi[0], edge_index_bi[1])
], dim=0)

edge_index_unique = torch.unique(sorted_edge_index, dim=1)

# Criar o objeto PyG
data = Data(
    x=node_features,
    edge_index=edge_index_unique
)

print("Grafo intra-projeto criado (bidirecional):")
print("  - Nós:", data.x.shape)
print("  - Arestas:", data.edge_index.shape)


# Função para filtrar subgrafos isolados
def filter_edges_by_split(edge_index, train_idx, val_idx, test_idx):
    allowed_train_val = set(train_idx.tolist() + val_idx.tolist())
    allowed_test = set(test_idx.tolist())

    filtered_edges = []

    for src, dst in edge_index.T.tolist():
        if (src in allowed_train_val and dst in allowed_train_val) or \
           (src in allowed_test and dst in allowed_test):
            filtered_edges.append([src, dst])

    return torch.tensor(filtered_edges, dtype=torch.long).T


Grafo intra-projeto criado (bidirecional):
  - Nós: torch.Size([2796, 384])
  - Arestas: torch.Size([2, 6500])


In [18]:
# CONFIGURAÇÃO DE RUNS 
N_RUNS = 10
OUT_DIR = f"runs_results_{project_name}"
os.makedirs(OUT_DIR, exist_ok=True)

# Hiperparâmetros 
BATCH_SIZE = 8
EPOCHS = 200
LEARNING_RATE = 1e-4
DROPOUT = 0
PATIENCE = 5
NORMALIZE_TYPE = "minmax"  # "minmax", "standard", or None
SEED = 42

device = DEVICE  


class GATModel(nn.Module):
    def __init__(self, in_channels, hidden_channels, heads=2):
        super().__init__()
        self.conv1 = GATConv(in_channels, hidden_channels, heads=heads)
        self.conv2 = GATConv(hidden_channels * heads, hidden_channels, heads=1)  # ou outro heads

        self.lin = nn.Linear(hidden_channels, 1)

    def forward(self, data):
        x, edge_index = data.x, data.edge_index
        x = torch.relu(self.conv1(x, edge_index))
        x = torch.relu(self.conv2(x, edge_index))
        return self.lin(x).view(-1)

# Salvar backup do edge_index original (não filtrado)
edge_index_orig = data.edge_index.cpu().clone().detach()

# Funções utilitárias
def set_all_seeds(seed):
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

def prepare_split_and_loaders(seed, edge_index_backup):
    """
    Restaura edge_index, cria split (70/15/15) com seed fornecida,
    ajusta scaler somente com train, define data.y escalonado, e cria NeighborLoaders.
    Retorna: scaler_fit, train_loader, val_loader, test_loader, train_idx, val_idx, test_idx
    """
    # Restaura grafo global
    data.edge_index = edge_index_backup.clone().to(data.x.device)

    # Indices
    all_indices = np.arange(data.num_nodes)

    # Split 
    train_idx, temp_idx = train_test_split(all_indices, test_size=0.3, random_state=seed)
    val_idx, test_idx = train_test_split(temp_idx, test_size=0.5, random_state=seed)

    # Ajustar scaler 
    sp_train = df_project.loc[train_idx, "Story_Point"].fillna(0).values.reshape(-1, 1)

    if NORMALIZE_TYPE == "minmax":
        scaler_local = MinMaxScaler().fit(sp_train)
    elif NORMALIZE_TYPE == "standard":
        scaler_local = StandardScaler().fit(sp_train)
    else:
        scaler_local = None

    # Transformar todos os rótulos (train+val+test) com scaler ajustado no train
    sp_all = df_project["Story_Point"].fillna(0).values.reshape(-1, 1)
    if scaler_local is not None:
        sp_all_scaled = scaler_local.transform(sp_all).ravel()
    else:
        sp_all_scaled = sp_all.ravel()

    data.y = torch.tensor(sp_all_scaled, dtype=torch.float32)

    # Filtrar arestas entre grupos diferentes
    data.edge_index = filter_edges_by_split(
        edge_index=data.edge_index,
        train_idx=train_idx,
        val_idx=val_idx,
        test_idx=test_idx
    ).to(data.x.device)

    # Criar loaders
    train_loader = NeighborLoader(
        data,
        num_neighbors=[3,3],
        batch_size=BATCH_SIZE,
        input_nodes=torch.tensor(train_idx),
        shuffle=True
    )

    val_loader = NeighborLoader(
        data,
        num_neighbors=[3,3],
        batch_size=BATCH_SIZE,
        input_nodes=torch.tensor(val_idx),
        shuffle=False
    )

    test_loader = NeighborLoader(
        data,
        num_neighbors=[3,3],
        batch_size=BATCH_SIZE,
        input_nodes=torch.tensor(test_idx),
        shuffle=False
    )

    return scaler_local, train_loader, val_loader, test_loader, train_idx, val_idx, test_idx


def evaluate_model(model, dataloader, criterion):
    model.eval()
    predictions = []
    actuals = []
    total_loss = 0.0

    with torch.no_grad():
        for batch in dataloader:
            batch = batch.to(device)
            outputs = model(batch).cpu().numpy()
            targets = batch.y.cpu().numpy()

            loss = criterion(torch.tensor(outputs), torch.tensor(targets))
            total_loss += loss.item()

            predictions.extend(outputs)
            actuals.extend(targets)

    mae = mean_absolute_error(actuals, predictions)
    mse = mean_squared_error(actuals, predictions)
    mdae = median_absolute_error(actuals, predictions)
    errors = np.abs(np.array(predictions) - np.array(actuals))
    with np.errstate(divide='ignore', invalid='ignore'):
        within_50 = np.where(np.array(actuals) != 0, errors <= (0.50 * np.array(actuals)), errors == 0)
    pred_50 = float(np.mean(within_50))

    return total_loss / max(1, len(dataloader)), float(mae), float(mse), float(mdae), float(pred_50)

# Loop principal de runs 
all_run_results = []

print(f"Starting {N_RUNS} runs (base seed={SEED})")
for run_id in range(N_RUNS):
    run_seed = SEED + run_id
    set_all_seeds(run_seed)

    # preparar dados / loaders (restaurando edge_index original)
    scaler_run, train_loader, val_loader, test_loader, train_idx, val_idx, test_idx = prepare_split_and_loaders(run_seed, edge_index_orig)

    # criar modelo novo para a run
    in_channels = data.x.size(1)
    hidden_dim = 64
    model = GATModel(in_channels=in_channels, hidden_channels=hidden_dim, heads=2).to(device)

    optimizer = optim.AdamW(model.parameters(), lr=LEARNING_RATE, weight_decay=1e-3)
    scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode="min", patience=(PATIENCE // 2)+1, factor=0.5)
    criterion = nn.MSELoss()

    best_val_loss = float("inf")
    epochs_no_improve = 0
    best_model_state = None
    train_losses = []
    val_mse_history = []

    start_time = time.time()
    print(f"\n--- Run {run_id+1}/{N_RUNS} (seed={run_seed}) ---")
    for epoch in range(EPOCHS):
        model.train()
        total_loss = 0.0

        for batch in train_loader:
            batch = batch.to(device)
            optimizer.zero_grad()
            outputs = model(batch)
            targets = batch.y.to(device)
            loss = criterion(outputs, targets)
            loss.backward()
            optimizer.step()
            total_loss += loss.item()

        avg_train_loss = total_loss / max(1, len(train_loader))
        train_losses.append(avg_train_loss)

        # validação
        _, _, val_mse, *_ = evaluate_model(model, val_loader, criterion)
        val_mse_history.append(val_mse)

        print(f"\rRun {run_id+1}/{N_RUNS} Epoch {epoch+1}/{EPOCHS} - Train Loss: {avg_train_loss:.6f} - Val MSE: {val_mse:.6f}", end="", flush=True)

        # early stopping logic
        if val_mse < best_val_loss:
            best_val_loss = val_mse
            epochs_no_improve = 0
            best_model_state = copy.deepcopy(model.state_dict())
        else:
            epochs_no_improve += 1

        scheduler.step(val_mse)

        if epochs_no_improve >= PATIENCE:
            print(f"\nEarly stopping (run {run_id+1}) after epoch {epoch+1}. Best Val MSE: {best_val_loss:.6f}")
            break

    # restaura melhor modelo
    if best_model_state is not None:
        model.load_state_dict(best_model_state)

    elapsed = time.time() - start_time

    # avaliação final no teste
    _, mae_scaled, mse_scaled, mdae_scaled, pred50_scaled = evaluate_model(model, test_loader, criterion)

    # converter métricas para escala real 
    if scaler_run is not None:
        if NORMALIZE_TYPE == "minmax":
            SP_MIN, SP_MAX = df_project["Story_Point"].min(), df_project["Story_Point"].max()
            scale_factor = SP_MAX - SP_MIN
            if scale_factor == 0:
                scale_factor = 1.0
            test_metrics_real = {
                "MAE": mae_scaled * scale_factor,
                "MSE": mse_scaled * (scale_factor ** 2),
                "MdAE": mdae_scaled * scale_factor,
                "Pred(50)": pred50_scaled
            }
        elif NORMALIZE_TYPE == "standard":
            std_dev_real = df_project["Story_Point"].std()
            if std_dev_real == 0:
                std_dev_real = 1.0
            test_metrics_real = {
                "MAE": mae_scaled * std_dev_real,
                "MSE": mse_scaled * (std_dev_real ** 2),
                "MdAE": mdae_scaled * std_dev_real,
                "Pred(50)": pred50_scaled
            }
        else:
            test_metrics_real = {"MAE": mae_scaled, "MSE": mse_scaled, "MdAE": mdae_scaled, "Pred(50)": pred50_scaled}
    else:
        test_metrics_real = {"MAE": mae_scaled, "MSE": mse_scaled, "MdAE": mdae_scaled, "Pred(50)": pred50_scaled}

    # salvar resultados da run
    run_result = {
        "run_id": run_id + 1,
        "seed": run_seed,
        "elapsed_sec": elapsed,
        "best_val_mse": float(best_val_loss),
        "epochs_trained": epoch+1,
        "train_loss_history": train_losses,
        "val_mse_history": val_mse_history,
        "test_metrics_scaled": {"MAE": float(mae_scaled), "MSE": float(mse_scaled), "MdAE": float(mdae_scaled), "Pred(50)": float(pred50_scaled)},
        "test_metrics_real": {k: float(v) for k, v in test_metrics_real.items()}
    }

    # salvar arquivo json e pesos
    json_path = os.path.join(OUT_DIR, f"{VERSAO_MODELO}_run{run_id+1}_seed{run_seed}.json")
    torch_path = os.path.join(OUT_DIR, f"{VERSAO_MODELO}_run{run_id+1}_seed{run_seed}.pt")
    with open(json_path, "w") as f:
        json.dump(run_result, f, indent=4)
    torch.save(best_model_state, torch_path)

    all_run_results.append(run_result)

    # liberar cache GPU
    try:
        torch.cuda.empty_cache()
    except Exception:
        pass

# --- Consolidação final ---
maes = [r["test_metrics_real"]["MAE"] for r in all_run_results]
mses = [r["test_metrics_real"]["MSE"] for r in all_run_results]
mdaes = [r["test_metrics_real"]["MdAE"] for r in all_run_results]
pred50s = [r["test_metrics_real"]["Pred(50)"] for r in all_run_results]

summary = {
    "n_runs": N_RUNS,
    "seed_base": SEED,
    "MAE_mean": float(statistics.mean(maes)),
    "MAE_std": float(statistics.pstdev(maes)),
    "MSE_mean": float(statistics.mean(mses)),
    "MSE_std": float(statistics.pstdev(mses)),
    "MdAE_mean": float(statistics.mean(mdaes)),
    "MdAE_std": float(statistics.pstdev(mdaes)),
    "Pred50_mean": float(statistics.mean(pred50s)),
    "Pred50_std": float(statistics.pstdev(pred50s)),
    "per_run_files": [os.path.basename(f"{VERSAO_MODELO}_run{r['run_id']}_seed{r['seed']}.json") for r in all_run_results]
}

summary_path = os.path.join(OUT_DIR, f"{VERSAO_MODELO}_summary_{N_RUNS}runs.json")
with open(summary_path, "w") as f:
    json.dump({"summary": summary, "runs": all_run_results}, f, indent=4)

print("\nAll runs finished.")
print("Summary saved to:", summary_path)
print("MAE mean (real scale):", summary["MAE_mean"], "±", summary["MAE_std"])


Starting 10 runs (base seed=42)

--- Run 1/10 (seed=42) ---




Run 1/10 Epoch 5/200 - Train Loss: 0.021274 - Val MSE: 0.019402

KeyboardInterrupt: 