# Build XGB using GNN embeddings
In this notebook we build a simple GNN model and combine it with XGB model to see if GNN improves XGB

# Simple GNN Baseline

In [1]:
import os
os.environ.setdefault("CUDA_VISIBLE_DEVICES", "7")

LOAD = 2
VER=7

In [2]:
import numpy as np
import pandas as pd
import torch
import torch.nn.functional as F

from sklearn.model_selection import KFold
from sklearn.metrics import roc_auc_score

from torch_geometric.data import Data
from torch_geometric.nn import SAGEConv

KeyboardInterrupt: 

In [None]:
author_features = pd.read_parquet(f"../data/author_features_v{LOAD}.pqt")
author_targets = pd.read_parquet(f"../data/author_targets_v{LOAD}.pqt")

In [None]:
author_targets.head()

Unnamed: 0,Author_ID,target,coauthor_ids,coauthor_counts,coauthor_time_delta,degree
0,0,0,[],[],[],0
1,1,0,"[2, 3, 4, 5, 6, 7, 8]","[1, 1, 1, 1, 1, 1, 1]","[1461, 1461, 1461, 1461, 1461, 1461, 1461]",7
2,2,0,"[1, 3, 4, 5, 6, 7, 8]","[1, 1, 1, 1, 1, 1, 1]","[1461, 1461, 1461, 1461, 1461, 1461, 1461]",7
3,3,0,"[1, 2, 4, 5, 6, 7, 8]","[1, 1, 1, 1, 1, 1, 1]","[1461, 1461, 1461, 1461, 1461, 1461, 1461]",7
4,4,1,"[1, 2, 3, 5, 6, 7, 8, 2905, 2819, 2906, 2907, ...","[1, 1, 1, 4, 4, 1, 1, 2, 2, 1, 2, 2, 1, 1, 1, ...","[1461, 1461, 1461, 761, 761, 1461, 1461, 1013,...",38


In [None]:
N = len(author_features)
assert author_features["Author_ID"].is_unique

id2idx = pd.Series(np.arange(N), index=author_features["Author_ID"].values)

In [None]:
# LOG TRANSFORM FOR NN
for c in [
    "n_coauthors",
    "total_collaborations",
    "total_papers",
    "total_citations",
    "citations_last_3y",
    "max_citations_single_paper",
]:
    author_features[f"log_{c}"] = np.log1p(author_features[c])

In [None]:
TARGET = "target"

BASE_FEATURES = [
    # --------------------
    # Collaboration graph
    # --------------------
    "n_coauthors",
    "total_collaborations",
    "avg_collab_strength",
    "max_collab_strength",
    "collab_entropy",

    # --------------------
    # Productivity / recency
    # --------------------
    "total_papers",
    "papers_last_1y",
    "papers_last_3y",
    "days_since_last_paper",

    # --------------------
    # Topic specialization
    # --------------------
    #"top_category",          # categorical-as-ordinal (needs encoding for NN)
    "top_category_frac",
    "category_entropy",

    # --------------------
    # Citation impact
    # --------------------
    "total_citations",
    "avg_citations_per_paper",
    "max_citations_single_paper",
    "citations_last_3y",
]

EXTRA_FEATURES = [
    "log_n_coauthors",
    "log_total_collaborations",
    "log_total_papers",
    "log_total_citations",
    "log_citations_last_3y",
    "log_max_citations_single_paper",
]

FEATURES = [c for c in (BASE_FEATURES + EXTRA_FEATURES) if c in author_features.columns]
print("Using features:", FEATURES)

Using features: ['n_coauthors', 'total_collaborations', 'avg_collab_strength', 'max_collab_strength', 'collab_entropy', 'total_papers', 'papers_last_1y', 'papers_last_3y', 'days_since_last_paper', 'top_category_frac', 'category_entropy', 'total_citations', 'avg_citations_per_paper', 'max_citations_single_paper', 'citations_last_3y', 'log_n_coauthors', 'log_total_collaborations', 'log_total_papers', 'log_total_citations', 'log_citations_last_3y', 'log_max_citations_single_paper']


In [None]:
X = author_features[FEATURES].astype(np.float32).values
y = author_features[TARGET].astype(np.int64).values

# standardize
X_mean = X.mean(axis=0, keepdims=True)
X_std  = X.std(axis=0, keepdims=True) + 1e-6
X = (X - X_mean) / X_std

x = torch.tensor(X, dtype=torch.float)
y = torch.tensor(y, dtype=torch.long)

In [None]:
"""
# Make sure coauthor lists align to author_features order
author_targets_aligned = author_targets.set_index("Author_ID").loc[author_features["Author_ID"]].reset_index()

src_list = []
dst_list = []
w_list   = []

for i, (co_ids, co_cts) in enumerate(
    zip(author_targets_aligned["coauthor_ids"], author_targets_aligned["coauthor_counts"])
):
    # skip isolated authors
    if len(co_ids) == 0:
        continue

    js = id2idx.loc[co_ids].values.astype(np.int64)
    cts = np.asarray(co_cts, dtype=np.float32)

    src_list.append(np.full(len(js), i, dtype=np.int64))
    dst_list.append(js)
    w_list.append(cts)

src = np.concatenate(src_list) if src_list else np.empty(0, dtype=np.int64)
dst = np.concatenate(dst_list) if dst_list else np.empty(0, dtype=np.int64)
w   = np.concatenate(w_list)   if w_list   else np.empty(0, dtype=np.float32)

edge_index = torch.tensor(np.vstack([src, dst]), dtype=torch.long)
edge_weight = torch.tensor(w, dtype=torch.float)
"""
# Make sure coauthor lists align to author_features order
author_targets_aligned = author_targets.set_index("Author_ID").loc[author_features["Author_ID"]].reset_index()

src_list = []
dst_list = []
edge_attr_list = [] # Changed from w_list to edge_attr_list

# Iterate over coauthor_ids, counts, AND time_deltas
# Ensure 'coauthor_time_delta' exists in your dataframe!
for i, (co_ids, co_cts, co_times) in enumerate(
    zip(
        author_targets_aligned["coauthor_ids"], 
        author_targets_aligned["coauthor_counts"],
        author_targets_aligned["coauthor_time_delta"] # New Column
    )
):
    # skip isolated authors
    if len(co_ids) == 0:
        continue

    js = id2idx.loc[co_ids].values.astype(np.int64)
    
    # Process edge features
    cts = np.asarray(co_cts, dtype=np.float32)
    times = np.asarray(co_times, dtype=np.float32)
    
    # Log transform features to stabilize training (recommended)
    cts_log = np.log1p(cts)
    times_log = np.log1p(times)
    
    # Stack features to create [num_edges, 2] matrix
    edge_features = np.stack([cts_log, times_log], axis=1)

    src_list.append(np.full(len(js), i, dtype=np.int64))
    dst_list.append(js)
    edge_attr_list.append(edge_features)

src = np.concatenate(src_list) if src_list else np.empty(0, dtype=np.int64)
dst = np.concatenate(dst_list) if dst_list else np.empty(0, dtype=np.int64)

# Create 2D edge attributes instead of 1D weights
edge_attr = np.concatenate(edge_attr_list) if edge_attr_list else np.empty((0, 2), dtype=np.float32)

edge_index = torch.tensor(np.vstack([src, dst]), dtype=torch.long)
edge_attr = torch.tensor(edge_attr, dtype=torch.float)

In [None]:
print("Num nodes:", N)
print("Num edges (directed):", edge_index.size(1))

Num nodes: 143691
Num edges (directed): 11182120


In [None]:
data = Data(x=x, edge_index=edge_index, y=y)
data.edge_attr = edge_attr  # Assign the 2D features here

In [None]:
from torch_geometric.nn import TransformerConv

class EdgeGAT(torch.nn.Module): # Class name kept same for compatibility
    def __init__(self, in_dim, hidden_dim=128, dropout=0.1):
        super().__init__()
        # Switch to TransformerConv to handle edge_dim=2
        self.conv1 = TransformerConv(in_dim, hidden_dim, edge_dim=2)
        self.conv2 = TransformerConv(hidden_dim, hidden_dim, edge_dim=2)
        self.lin   = torch.nn.Linear(hidden_dim, 1)
        self.dropout = dropout

    def forward(self, x, edge_index, edge_attr, return_emb=False):
        # Pass edge_attr to the conv layers
        h = self.conv1(x, edge_index, edge_attr)
        h = F.relu(h)
        h = F.dropout(h, p=self.dropout, training=self.training)

        h = self.conv2(h, edge_index, edge_attr)
        h = F.relu(h)
        h = F.dropout(h, p=self.dropout, training=self.training)

        if return_emb:
            return h

        out = self.lin(h).squeeze(-1)
        return out

In [None]:
class GraphSAGE(torch.nn.Module):
    def __init__(self, in_dim, hidden_dim=128, dropout=0.1):
        super().__init__()
        self.conv1 = SAGEConv(in_dim, hidden_dim)
        self.conv2 = SAGEConv(hidden_dim, hidden_dim)
        self.lin   = torch.nn.Linear(hidden_dim, 1)
        self.dropout = dropout

    def forward(self, x, edge_index, return_emb=False):
        h = self.conv1(x, edge_index)
        h = F.relu(h)
        h = F.dropout(h, p=self.dropout, training=self.training)

        h = self.conv2(h, edge_index)
        h = F.relu(h)
        h = F.dropout(h, p=self.dropout, training=self.training)

        if return_emb:
            return h

        out = self.lin(h).squeeze(-1)
        return out

In [None]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
data = data.to(device)

N_SPLITS = 5
kf = KFold(n_splits=N_SPLITS, shuffle=True, random_state=42)

N = data.num_nodes
EMB_DIM = 128

# OOF storage (optional, for diagnostics)
oof_gnn = np.zeros(N, dtype=np.float32)
oof_embs  = np.zeros((N, EMB_DIM), dtype=np.float32)

fold_aucs   = []
fold_states = []        
fold_splits = []        

def train_one_fold(train_idx, val_idx, epochs=300, lr=5e-3, wd=1e-4):
    """
    model = GraphSAGE(
        in_dim=data.x.size(1),
        hidden_dim=EMB_DIM,
        dropout=0.1
    ).to(device)
    """
    model = EdgeGAT(
        in_dim=data.x.size(1),
        hidden_dim=EMB_DIM,
        dropout=0.1
    ).to(device)
    
    opt = torch.optim.Adam(model.parameters(), lr=lr, weight_decay=wd)

    train_mask = torch.zeros(N, dtype=torch.bool, device=device)
    val_mask   = torch.zeros(N, dtype=torch.bool, device=device)
    train_mask[torch.tensor(train_idx, device=device)] = True
    val_mask[torch.tensor(val_idx, device=device)] = True

    y_float = data.y.float()

    best_auc = -1.0
    best_state = None

    for epoch in range(1, epochs + 1):
        model.train()
        opt.zero_grad()

        """
        logits = model(data.x, data.edge_index)
        """
        logits = model(data.x, data.edge_index, data.edge_attr)

        loss = F.binary_cross_entropy_with_logits(
            logits[train_mask],
            y_float[train_mask]
        )

        loss.backward()
        opt.step()

        # validation AUC
        model.eval()
        with torch.no_grad():
            val_probs = torch.sigmoid(logits[val_mask]).cpu().numpy()
            val_true  = data.y[val_mask].cpu().numpy()
            auc = roc_auc_score(val_true, val_probs)

        if auc > best_auc:
            best_auc = auc
            best_state = {
                k: v.detach().cpu().clone()
                for k, v in model.state_dict().items()
            }

        if epoch % 30 == 0 or epoch == 1:
            print(
                f"  epoch {epoch:3d} | "
                f"loss {loss.item():.4f} | "
                f"val AUC {auc:.5f} | "
                f"best {best_auc:.5f}"
            )

    # restore best model
    model.load_state_dict({k: v.to(device) for k, v in best_state.items()})
    model.eval()

    with torch.no_grad():
        """logits = model(data.x, data.edge_index)"""
        logits = model(data.x, data.edge_index, data.edge_attr)
        val_probs = torch.sigmoid(logits[val_mask]).cpu().numpy()

        """emb = model(data.x, data.edge_index, return_emb=True)"""
        emb = model(data.x, data.edge_index, data.edge_attr, return_emb=True)
        val_emb = emb[val_mask].cpu().numpy()

    return best_auc, best_state, val_probs, val_emb

: 

In [None]:
for fold, (train_idx, val_idx) in enumerate(kf.split(np.arange(N)), 1):
    print(f"\n##### FOLD {fold} #####")

    auc, best_state, val_probs, val_emb = train_one_fold(
        train_idx,
        val_idx,
        epochs=300,
        lr=5e-3,
        wd=1e-4
    )

    fold_aucs.append(auc)
    fold_states.append(best_state)        # <-- SAVE MODEL
    fold_splits.append((train_idx, val_idx))  # <-- SAVE SPLIT

    oof_gnn[val_idx] = val_probs
    oof_embs[val_idx]  = val_emb

    print(f"Fold {fold} best AUC: {auc:.5f}")

print("\n==============================")
print("Fold AUCs:", [f"{a:.5f}" for a in fold_aucs])
print("Mean AUC :", float(np.mean(fold_aucs)))
print("OOF AUC  :", float(roc_auc_score(data.y.cpu().numpy(), oof_gnn)))
print("==============================")


##### FOLD 1 #####


In [None]:
# SAVE GNN OOF
np.save(f"data/oof_preds_gnn_v{VER}",oof_gnn)

# Train XGB w/ GNN Embeddings
We will now train XGB using GNN embeddings. This performs better than simple average of XGB and GNN probability predictions that was done in the previous notebook.

In [None]:
xgb_params = {
    "objective": "binary:logistic",
    "eval_metric": "auc",
    "learning_rate": 0.05,
    "max_depth": 6,
    "min_child_weight": 5,
    "subsample": 0.8,
    "colsample_bytree": 0.8,
    "tree_method": "hist",
    "n_estimators": 10_000,
    "random_state": 42,
    "early_stopping_rounds": 50,
    "device": "cuda",
}

In [None]:
import xgboost as xgb
print(f"XGB version: {xgb.__version__}")

XGB version: 3.1.2


In [None]:
from scipy.stats import rankdata
from cuml.decomposition import PCA

# tabular features
X_tab = author_features[FEATURES].values.astype(np.float32)
y = author_features[TARGET].values.astype(np.int64)

N = len(y)
oof_preds = np.zeros(N, dtype=np.float32)
fold_scores = []

# IMPORTANT: reuse the SAME splits as GNN training
for fold, ((train_idx, val_idx), gnn_state) in enumerate(
    zip(fold_splits, fold_states), 1
):
    print(f"\n##### STACK FOLD {fold} #####")

    # -----------------------------
    # Load fold-specific GNN
    # -----------------------------
    gnn = GraphSAGE(
        in_dim=data.x.size(1),
        hidden_dim=EMB_DIM,
        dropout=0.1
    ).to(device)

    gnn.load_state_dict({k: v.to(device) for k, v in gnn_state.items()})
    gnn.eval()

    # -----------------------------
    # Extract embeddings (same space)
    # -----------------------------
    with torch.no_grad():
        emb_all = gnn(
            data.x,
            data.edge_index,
            return_emb=True
        ).cpu().numpy().astype(np.float32)

    # -----------------------------
    # PCA on embeddings (FIT = train only)
    # -----------------------------
    pca = PCA(n_components=8)
    emb_train_pca = pca.fit_transform(emb_all[train_idx])
    emb_val_pca   = pca.transform(emb_all[val_idx])

    # -----------------------------
    # Build stacked features
    # -----------------------------
    X_train = np.hstack([
        X_tab[train_idx],
        emb_train_pca
    ])

    X_val = np.hstack([
        X_tab[val_idx],
        emb_val_pca
    ])

    y_train, y_val = y[train_idx], y[val_idx]

    # -----------------------------
    # Train stack XGB (regularized!)
    # -----------------------------
    model = xgb.XGBClassifier(**xgb_params)

    model.fit(
        X_train,
        y_train,
        eval_set=[(X_val, y_val)],
        verbose=False,
    )

    val_preds = model.predict_proba(X_val)[:, 1]
    oof_preds[val_idx] = rankdata(val_preds)

    auc = roc_auc_score(y_val, val_preds)
    fold_scores.append(auc)

    print(f"Fold {fold} AUC: {auc:.5f}")

print("\n==============================")
print("Fold AUCs:", [f"{s:.5f}" for s in fold_scores])
print("Mean AUC :", np.mean(fold_scores))
print("OOF AUC  :", roc_auc_score(y, oof_preds))
print("==============================")


##### STACK FOLD 1 #####
Fold 1 AUC: 0.75610

##### STACK FOLD 2 #####
Fold 2 AUC: 0.75643

##### STACK FOLD 3 #####
Fold 3 AUC: 0.76022

##### STACK FOLD 4 #####
Fold 4 AUC: 0.76307

##### STACK FOLD 5 #####
Fold 5 AUC: 0.75704

Fold AUCs: ['0.75610', '0.75643', '0.76022', '0.76307', '0.75704']
Mean AUC : 0.7585728593018518
OOF AUC  : 0.7585639778208273


In [None]:
# SAVE GNN OOF
np.save(f"data/oof_preds_xgb_gnn_v{VER}",oof_preds)

# Compare to Ensemble

In [None]:
oof_xgb = np.load(f"data/oof_preds_xgb_v{LOAD}.npy")
print("XGB OOF AUC  :", float(roc_auc_score(author_features[TARGET].values, oof_xgb)))

oof_gnn = np.load(f"data/oof_preds_gnn_v{VER}.npy")
print("GNN OOF AUC  :", float(roc_auc_score(author_features[TARGET].values, oof_gnn)))

blend = (oof_xgb + oof_gnn)/2.
print("XGB GNN BLEND  :", float(roc_auc_score(author_features[TARGET].values, blend)))

oof_xgb_gnn = np.load(f"data/oof_preds_xgb_gnn_v{VER}.npy")
print("XGB w/ GNN emb  :", float(roc_auc_score(author_features[TARGET].values, oof_xgb_gnn)))

XGB OOF AUC  : 0.755080970524253
GNN OOF AUC  : 0.7556822341151668

XGB GNN BLEND  : 0.7576088669046419
XGB w/ GNN emb  : 0.7585639778208273
