# Build GNN and Ensemble with XGB
In this notebook we build a simple GNN model and combine it with XGB model to see if GNN improves XGB

# Simple GNN Baseline

In [1]:
import os
os.environ.setdefault("CUDA_VISIBLE_DEVICES", "7")

VER=2

In [2]:
import numpy as np
import pandas as pd
import torch
import torch.nn.functional as F

from sklearn.model_selection import KFold
from sklearn.metrics import roc_auc_score

from torch_geometric.data import Data
from torch_geometric.nn import SAGEConv

In [3]:
author_features = pd.read_parquet(f"data/author_features_v{VER}.pqt")
author_targets = pd.read_parquet(f"data/author_targets_v{VER}.pqt")

In [4]:
author_targets.head()

Unnamed: 0,Author_ID,target,coauthor_ids,coauthor_counts,degree
0,0,0,[],[],0
1,1,0,"[2, 3, 4, 5, 6, 7, 8]","[1, 1, 1, 1, 1, 1, 1]",7
2,2,0,"[1, 3, 4, 5, 6, 7, 8]","[1, 1, 1, 1, 1, 1, 1]",7
3,3,0,"[1, 2, 4, 5, 6, 7, 8]","[1, 1, 1, 1, 1, 1, 1]",7
4,4,1,"[1, 2, 3, 5, 6, 7, 8, 2905, 2819, 2906, 2907, ...","[1, 1, 1, 4, 4, 1, 1, 2, 2, 1, 2, 2, 1, 1, 1, ...",38


In [5]:
N = len(author_features)
assert author_features["Author_ID"].is_unique

id2idx = pd.Series(np.arange(N), index=author_features["Author_ID"].values)

In [6]:
# LOG TRANSFORM FOR NN
for c in [
    "n_coauthors",
    "total_collaborations",
    "total_papers",
    "total_citations",
    "citations_last_3y",
    "max_citations_single_paper",
]:
    author_features[f"log_{c}"] = np.log1p(author_features[c])

In [7]:
TARGET = "target"

BASE_FEATURES = [
    # --------------------
    # Collaboration graph
    # --------------------
    "n_coauthors",
    "total_collaborations",
    "avg_collab_strength",
    "max_collab_strength",
    "collab_entropy",

    # --------------------
    # Productivity / recency
    # --------------------
    "total_papers",
    "papers_last_1y",
    "papers_last_3y",
    "days_since_last_paper",

    # --------------------
    # Topic specialization
    # --------------------
    #"top_category",          # categorical-as-ordinal (needs encoding for NN)
    "top_category_frac",
    "category_entropy",

    # --------------------
    # Citation impact
    # --------------------
    "total_citations",
    "avg_citations_per_paper",
    "max_citations_single_paper",
    "citations_last_3y",
]

EXTRA_FEATURES = [
    "log_n_coauthors",
    "log_total_collaborations",
    "log_total_papers",
    "log_total_citations",
    "log_citations_last_3y",
    "log_max_citations_single_paper",
]

FEATURES = [c for c in (BASE_FEATURES + EXTRA_FEATURES) if c in author_features.columns]
print("Using features:", FEATURES)

Using features: ['n_coauthors', 'total_collaborations', 'avg_collab_strength', 'max_collab_strength', 'collab_entropy', 'total_papers', 'papers_last_1y', 'papers_last_3y', 'days_since_last_paper', 'top_category_frac', 'category_entropy', 'total_citations', 'avg_citations_per_paper', 'max_citations_single_paper', 'citations_last_3y', 'log_n_coauthors', 'log_total_collaborations', 'log_total_papers', 'log_total_citations', 'log_citations_last_3y', 'log_max_citations_single_paper']


In [8]:
X = author_features[FEATURES].astype(np.float32).values
y = author_features[TARGET].astype(np.int64).values

# standardize
X_mean = X.mean(axis=0, keepdims=True)
X_std  = X.std(axis=0, keepdims=True) + 1e-6
X = (X - X_mean) / X_std

x = torch.tensor(X, dtype=torch.float)
y = torch.tensor(y, dtype=torch.long)

In [9]:
# Make sure coauthor lists align to author_features order
author_targets_aligned = author_targets.set_index("Author_ID").loc[author_features["Author_ID"]].reset_index()

src_list = []
dst_list = []
w_list   = []

for i, (co_ids, co_cts) in enumerate(
    zip(author_targets_aligned["coauthor_ids"], author_targets_aligned["coauthor_counts"])
):
    # skip isolated authors
    if len(co_ids) == 0:
        continue

    js = id2idx.loc[co_ids].values.astype(np.int64)
    cts = np.asarray(co_cts, dtype=np.float32)

    src_list.append(np.full(len(js), i, dtype=np.int64))
    dst_list.append(js)
    w_list.append(cts)

src = np.concatenate(src_list) if src_list else np.empty(0, dtype=np.int64)
dst = np.concatenate(dst_list) if dst_list else np.empty(0, dtype=np.int64)
w   = np.concatenate(w_list)   if w_list   else np.empty(0, dtype=np.float32)

edge_index = torch.tensor(np.vstack([src, dst]), dtype=torch.long)
edge_weight = torch.tensor(w, dtype=torch.float)

In [10]:
print("Num nodes:", N)
print("Num edges (directed):", edge_index.size(1))

Num nodes: 143691
Num edges (directed): 11182120


In [11]:
data = Data(x=x, edge_index=edge_index, y=y)
data.edge_weight = edge_weight  # optional

In [12]:
class GraphSAGE(torch.nn.Module):
    def __init__(self, in_dim, hidden_dim=128, dropout=0.2):
        super().__init__()
        self.conv1 = SAGEConv(in_dim, hidden_dim)
        self.conv2 = SAGEConv(hidden_dim, hidden_dim)
        self.lin   = torch.nn.Linear(hidden_dim, 1)
        self.dropout = dropout

    def forward(self, x, edge_index):
        x = self.conv1(x, edge_index)
        x = F.relu(x)
        x = F.dropout(x, p=self.dropout, training=self.training)
        x = self.conv2(x, edge_index)
        x = F.relu(x)
        x = F.dropout(x, p=self.dropout, training=self.training)
        out = self.lin(x).squeeze(-1)  # logits
        return out

In [13]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
data = data.to(device)

N_SPLITS = 5
kf = KFold(n_splits=N_SPLITS, shuffle=True, random_state=42)

oof_preds = np.zeros(N, dtype=np.float32)
fold_aucs = []

def train_one_fold(train_idx, val_idx, epochs=50, lr=1e-3, wd=1e-4):
    model = GraphSAGE(in_dim=data.x.size(1), hidden_dim=128, dropout=0.1).to(device)
    opt = torch.optim.Adam(model.parameters(), lr=lr, weight_decay=wd)

    train_mask = torch.zeros(N, dtype=torch.bool, device=device)
    val_mask   = torch.zeros(N, dtype=torch.bool, device=device)
    train_mask[torch.tensor(train_idx, device=device)] = True
    val_mask[torch.tensor(val_idx, device=device)] = True

    # BCE with logits
    y_float = data.y.float()

    best_auc = -1
    best_state = None

    for epoch in range(1, epochs + 1):
        model.train()
        opt.zero_grad()
        logits = model(data.x, data.edge_index)
        loss = F.binary_cross_entropy_with_logits(logits[train_mask], y_float[train_mask])
        loss.backward()
        opt.step()

        # quick val AUC
        model.eval()
        with torch.no_grad():
            val_probs = torch.sigmoid(logits[val_mask]).detach().cpu().numpy()
            val_true  = data.y[val_mask].detach().cpu().numpy()
            auc = roc_auc_score(val_true, val_probs)

        if auc > best_auc:
            best_auc = auc
            best_state = {k: v.detach().cpu().clone() for k, v in model.state_dict().items()}

        if epoch % 30 == 0 or epoch == 1:
            print(f"  epoch {epoch:3d} | loss {loss.item():.4f} | val AUC {auc:.5f} | best {best_auc:.5f}")

    # load best
    if best_state is not None:
        model.load_state_dict({k: v.to(device) for k, v in best_state.items()})

    # final val preds
    model.eval()
    with torch.no_grad():
        logits = model(data.x, data.edge_index)
        val_probs = torch.sigmoid(logits[val_mask]).detach().cpu().numpy()

    return best_auc, val_probs

for fold, (train_idx, val_idx) in enumerate(kf.split(np.arange(N)), 1):
    print(f"\n##### FOLD {fold} #####")
    auc, val_probs = train_one_fold(train_idx, val_idx, epochs=300, lr=5e-3, wd=1e-4)
    fold_aucs.append(auc)
    oof_preds[val_idx] = val_probs
    print(f"Fold {fold} best AUC: {auc:.5f}")

print("\n==============================")
print("Fold AUCs:", [f"{a:.5f}" for a in fold_aucs])
print("Mean AUC :", float(np.mean(fold_aucs)))
print("OOF AUC  :", float(roc_auc_score(author_features[TARGET].values, oof_preds)))
print("==============================")


##### FOLD 1 #####
  epoch   1 | loss 0.7222 | val AUC 0.43801 | best 0.43801
  epoch  30 | loss 0.5730 | val AUC 0.74629 | best 0.74629
  epoch  60 | loss 0.5690 | val AUC 0.74853 | best 0.75065
  epoch  90 | loss 0.5669 | val AUC 0.74952 | best 0.75204
  epoch 120 | loss 0.5646 | val AUC 0.75032 | best 0.75204
  epoch 150 | loss 0.5631 | val AUC 0.75117 | best 0.75251
  epoch 180 | loss 0.5618 | val AUC 0.75053 | best 0.75251
  epoch 210 | loss 0.5606 | val AUC 0.75101 | best 0.75261
  epoch 240 | loss 0.5598 | val AUC 0.75169 | best 0.75261
  epoch 270 | loss 0.5594 | val AUC 0.75093 | best 0.75302
  epoch 300 | loss 0.5575 | val AUC 0.75091 | best 0.75314
Fold 1 best AUC: 0.75314

##### FOLD 2 #####
  epoch   1 | loss 0.6815 | val AUC 0.60414 | best 0.60414
  epoch  30 | loss 0.5733 | val AUC 0.74551 | best 0.74551
  epoch  60 | loss 0.5688 | val AUC 0.74836 | best 0.75033
  epoch  90 | loss 0.5667 | val AUC 0.74895 | best 0.75188
  epoch 120 | loss 0.5649 | val AUC 0.75151 | best

In [14]:
# SAVE GNN OOF
np.save(f"data/oof_preds_gnn_v{VER}",oof_preds)

# Ensemble XGB and GNN
This is a simple experiment to see if using both XGB and GNN improves performance compared to using only one individually.

In [15]:
oof_xgb = np.load(f"data/oof_preds_xgb_v{VER}.npy")
print("XGB OOF AUC  :", float(roc_auc_score(author_features[TARGET].values, oof_xgb)))

oof_gnn = np.load(f"data/oof_preds_gnn_v{VER}.npy")
print("GNN OOF AUC  :", float(roc_auc_score(author_features[TARGET].values, oof_gnn)))

blend = (oof_xgb + oof_gnn)/2.
print("BLEND OOF AUC  :", float(roc_auc_score(author_features[TARGET].values, blend)))

XGB OOF AUC  : 0.755080970524253
GNN OOF AUC  : 0.7560026830991697
BLEND OOF AUC  : 0.7577506691419102
