In [1]:
from ImportLocalData import loadData

In [2]:
from BalanceClassDistribution import AdjustClassSamples, NumberOfSamplesClass

In [None]:
# Load one of the custom KG datasets from its four text files
# (node features, edges, edge features, node labels).
# The '...' prefixes are placeholders: replace them with the paths to your local files.
data = loadData('.../node_features.txt', '.../edges.txt', '.../edge_features.txt', '.../node_labels.txt')
# Optional: adjust the per-class sample counts with AdjustClassSamples (imported above);
# the original note says this handles outliers and that it was applied for the paper's results.
#data = AdjustClassSamples(data)

Import required packages

In [4]:
import torch
import torch.nn.functional as F
from torch_geometric.nn import GCNConv
from torch_geometric.loader import DataLoader
from torch.nn import LayerNorm
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from torch_geometric.utils import subgraph
import numpy as np
from torch_geometric.data import Data

Definition of StableGCN

In [10]:
# Computation runs on the CPU by default. If you have a GPU, you can enable it by using
#   torch.device("cuda" if torch.cuda.is_available() else "cpu")
device = torch.device("cpu")

# StableGCN: a two-layer GCN that combines Layer Normalization with skip connections.
# Those two ingredients stabilise training, which is where the model's name comes from.
class StableGCN(torch.nn.Module):
    """Two-layer GCN node classifier with LayerNorm, residual connections and dropout.

    Args:
        in_dim: size of the input node-feature vectors.
        h_dim: hidden size shared by both graph convolutions.
        out_dim: number of output logits per node.
    """

    def __init__(self, in_dim, h_dim, out_dim):
        super().__init__()
        # Attribute names and creation order are kept stable: they determine the
        # state_dict keys and the order in which random initial weights are drawn.
        self.c1 = GCNConv(in_dim, h_dim)
        self.c2 = GCNConv(h_dim, h_dim)
        self.n1 = LayerNorm(h_dim)
        self.n2 = LayerNorm(h_dim)
        self.lin = torch.nn.Linear(h_dim, out_dim)
        self.dp = torch.nn.Dropout(0.5)
        # Projects the raw input to h_dim so it can be added as the first skip connection.
        self.sk = torch.nn.Linear(in_dim, h_dim)

    def forward(self, d):
        """Return per-node logits for the graph `d` (reads `d.x` and `d.edge_index`)."""
        feats = d.x.to(device)
        edges = d.edge_index.to(device)

        # Block 1: conv -> norm -> ReLU, plus the projected input as residual, then dropout.
        residual = self.sk(feats)
        hidden = F.relu(self.n1(self.c1(feats, edges)))
        hidden = self.dp(hidden + residual)

        # Block 2: same pattern; the residual is the (already h_dim-sized) block-1 output.
        residual = hidden
        hidden = F.relu(self.n2(self.c2(hidden, edges)))
        hidden = self.dp(hidden + residual)

        return self.lin(hidden)


Normalization of Data

In [None]:
# Standardise the node features. scikit-learn operates on NumPy arrays, so the features
# are taken to the CPU for scaling and then placed on the configured `device`.
scaler = StandardScaler()
x_scaled = scaler.fit_transform(data.x.cpu().numpy())
data.x = torch.tensor(x_scaled, dtype=torch.float).to(device)

# Stratified splitter: every fold keeps the class proportions balanced (5-fold cross-validation).
k_folds = 5
skf = StratifiedKFold(n_splits=k_folds, shuffle=True, random_state=42)

# Per-fold metric histories, kept separately for the inductive and transductive evaluations.
metric_names = ('accuracy', 'precision', 'recall', 'f1', 'mrr')
inductive_results = {name: [] for name in metric_names}
transductive_results = {name: [] for name in metric_names}

# Collects the fold headers and per-epoch log lines so they can be written to a file later.
all_logs = []

MRR Metric Definition 

In [7]:
def mrr(y_true, y_prob):
    """Mean Reciprocal Rank of the true class within each sample's score ranking.

    Args:
        y_true: iterable of integer class labels, one per sample.
        y_prob: iterable of per-class score vectors, aligned with `y_true`.

    Returns:
        The mean over samples of 1 / rank, where rank 1 is the highest-scored class.
    """
    reciprocal_ranks = [
        # Sort classes by descending score, then find the 1-based position of the true label.
        1 / (np.flatnonzero(np.argsort(scores)[::-1] == label)[0] + 1)
        for label, scores in zip(y_true, y_prob)
    ]
    return np.mean(reciprocal_ranks)

Training Model with 5-fold cross validation

In [None]:
# 5-fold stratified cross-validation. Every fold trains a fresh StableGCN and records
# all performance metrics, so the reported results do not depend on one particular split.
SEED = 42  # same value as the StratifiedKFold seed; makes the validation split and weight init repeatable
for fold_idx, (train_idx, test_idx) in enumerate(skf.split(torch.arange(data.num_nodes), data.y.cpu().numpy())):
    fold_header = f'Fold {fold_idx+1}/{k_folds}'
    print(fold_header)
    all_logs.append(fold_header)

    # Seed torch per fold (model initialisation, dropout) and fix the validation split,
    # so that a Restart & Run All reproduces the same numbers.
    torch.manual_seed(SEED + fold_idx)
    train_idx, val_idx = train_test_split(
        train_idx, test_size=0.1, stratify=data.y[train_idx].cpu().numpy(), random_state=SEED)
    train_idx = torch.tensor(train_idx, dtype=torch.long)
    val_idx = torch.tensor(val_idx, dtype=torch.long)
    test_idx = torch.tensor(test_idx, dtype=torch.long)

    # Node-induced subgraphs with node ids relabelled to 0..n-1 inside each split.
    t_sub = subgraph(train_idx, data.edge_index, relabel_nodes=True, num_nodes=data.num_nodes)
    v_sub = subgraph(val_idx, data.edge_index, relabel_nodes=True, num_nodes=data.num_nodes)
    ts_sub = subgraph(test_idx, data.edge_index, relabel_nodes=True, num_nodes=data.num_nodes)
    # The transductive graph keeps every node and the original node ids.
    g_sub = subgraph(torch.arange(data.num_nodes), data.edge_index, relabel_nodes=False, num_nodes=data.num_nodes)

    train_dat = Data(x=data.x[train_idx], edge_index=t_sub[0], y=data.y[train_idx])
    val_dat = Data(x=data.x[val_idx], edge_index=v_sub[0], y=data.y[val_idx])

    # `eval_mask` marks the nodes that count towards the test metrics.
    # Inductive graph: it only contains test nodes, so every node is scored.
    ind_test_dat = Data(x=data.x[test_idx], edge_index=ts_sub[0], y=data.y[test_idx],
                        eval_mask=torch.ones(test_idx.numel(), dtype=torch.bool))
    # Transductive graph: all nodes and edges are visible to the model, but only the
    # held-out test nodes are scored. Scoring train/validation nodes as well would
    # leak training labels into the test metrics.
    trans_mask = torch.zeros(data.num_nodes, dtype=torch.bool)
    trans_mask[test_idx] = True
    trans_test_dat = Data(x=data.x, edge_index=g_sub[0], y=data.y, eval_mask=trans_mask)

    # Each loader wraps a single graph, so every "batch" is that whole graph.
    bsz = 32
    loader_tr = DataLoader([train_dat], batch_size=bsz, shuffle=True)
    loader_val = DataLoader([val_dat], batch_size=bsz)
    loader_ti = DataLoader([ind_test_dat], batch_size=bsz)
    loader_tt = DataLoader([trans_test_dat], batch_size=bsz)

    feat_dim = data.num_node_features
    out_c = len(data.y.unique())
    hid = 256
    net = StableGCN(feat_dim, hid, out_c).to(device)
    opt = torch.optim.Adam(net.parameters(), lr=0.01, weight_decay=1e-4)
    lrsc = torch.optim.lr_scheduler.StepLR(opt, step_size=1000, gamma=0.75)
    lossf = torch.nn.CrossEntropyLoss()
    best_acc = 0
    best_weights = None

    for ep in range(4000):
        net.train()
        lval = 0
        for b in loader_tr:
            b = b.to(device)
            o = net(b)
            loss = lossf(o, b.y)
            loss.backward()
            torch.nn.utils.clip_grad_norm_(net.parameters(), max_norm=1.0)
            opt.step()
            opt.zero_grad()
            lval += loss.item()
        lrsc.step()

        # Validate (and possibly checkpoint) every 100 epochs.
        if (ep+1) % 100 == 0:
            net.eval()
            with torch.no_grad():
                v_loss, v_corr = 0, 0
                for vb in loader_val:
                    vb = vb.to(device)
                    outv = net(vb)
                    v_loss += lossf(outv, vb.y).item()
                    _, predv = outv.max(dim=1)
                    v_corr += predv.eq(vb.y).sum().item()
                v_loss /= len(loader_val)
                v_acc = v_corr / len(val_dat.y)
            log_line = f'Epoch: {ep+1}, Loss: {lval:.4f}, LR: {lrsc.get_last_lr()[0]:.6f}, Val Loss: {v_loss:.4f}, Val Acc: {v_acc:.4f}'
            print(log_line)
            all_logs.append(log_line)
            if v_acc > best_acc:
                best_acc = v_acc
                # state_dict() returns references to the live parameters; clone them,
                # otherwise the "best" snapshot keeps changing as training continues.
                best_weights = {name: tensor.detach().clone() for name, tensor in net.state_dict().items()}
            net.train()

    # Restore the best validation checkpoint. If validation accuracy never rose above 0
    # there is no checkpoint, and the final weights are evaluated instead.
    if best_weights is not None:
        net.load_state_dict(best_weights)
    net.eval()

    # Inductive Reasoning: no information about the test data is used during training.
    # Transductive Reasoning: all node and edge information is accessible, except for the labels of the test nodes.
    for tag, loader, resdict in [
        ("Inductive", loader_ti, inductive_results), ("Transductive", loader_tt, transductive_results)]:
        total, correct, pred_list, label_list, prob_list = 0, 0, [], [], []
        with torch.no_grad():
            for bt in loader:
                bt = bt.to(device)
                # The model sees the whole graph; only the masked (test) nodes are scored.
                outt = net(bt)[bt.eval_mask]
                yb = bt.y[bt.eval_mask]
                _, pred = outt.max(dim=1)
                total += yb.size(0)
                correct += pred.eq(yb).sum().item()
                pred_list.extend(pred.cpu().numpy())
                label_list.extend(yb.cpu().numpy())
                prob_list.extend(F.softmax(outt, dim=1).cpu().numpy())
        acc = correct / total if total else 0
        prec = precision_score(label_list, pred_list, average='weighted')
        rec = recall_score(label_list, pred_list, average='weighted')
        f1 = f1_score(label_list, pred_list, average='weighted')
        mrr_val = mrr(label_list, prob_list)
        resdict['accuracy'].append(acc)
        resdict['precision'].append(prec)
        resdict['recall'].append(rec)
        resdict['f1'].append(f1)
        resdict['mrr'].append(mrr_val)
        output = (f"{tag} Fold {fold_idx+1} | Acc: {acc:.4f}, Prec: {prec:.4f}, Rec: {rec:.4f}, F1: {f1:.4f}, MRR: {mrr_val:.4f}")
        print(output)
        all_logs.append(output)

Calculation of means and standard deviations of the performance results across folds

In [None]:
def getAvgStd(resdict):
    """Summarise per-fold metric lists.

    Args:
        resdict: mapping of metric name -> list of per-fold values.

    Returns:
        A `(means, stds)` pair of dicts keyed like `resdict`, holding the NumPy mean
        and (population) standard deviation of each metric's values.
    """
    means = {metric: np.mean(scores) for metric, scores in resdict.items()}
    stds = {metric: np.std(scores) for metric, scores in resdict.items()}
    return means, stds

# Aggregate the per-fold metrics. The result dictionaries are named
# `inductive_results` / `transductive_results` where they are created;
# the previous `ind_res` / `trans_res` names were never defined (NameError).
meanInd, stdInd = getAvgStd(inductive_results)
meanTrans, stdTrans = getAvgStd(transductive_results)

print('Final Inductive:', meanInd)
print('Std Inductive:', stdInd)
print('Final Transductive:', meanTrans)
print('Std Transductive:', stdTrans)

In [None]:
# Write the results to a .txt file: the complete training log (fold headers, epoch lines
# and per-fold test results, all collected in `all_logs`) followed by a mean/std summary.
# Previously this iterated over the undefined `log_output` and over `output`, which is a
# single string, so it would have emitted one character per line.
summary_lines = []
for tag, results in (("Inductive", inductive_results), ("Transductive", transductive_results)):
    for metric, values in results.items():
        summary_lines.append(f"{tag} {metric}: mean={np.mean(values):.4f}, std={np.std(values):.4f}")

with open('.../results.txt', 'w') as f:
    for line in all_logs:
        f.write(line + '\n')
    for line in summary_lines:
        f.write(line + '\n')
for line in summary_lines:
    print(line)