In [2]:
from ImportLocalData import loadData

In [3]:
from BalanceClassDistribution import AdjustClassSamples, NumberOfSamplesClass

In [None]:
# Load one of the custom knowledge-graph (KG) datasets from local text files.
# The '...' path prefixes are placeholders: point them at your local copies
# of the four input files before running this cell.
data = loadData(
    '.../node_features.txt',
    '.../edges.txt',
    '.../edge_features.txt',
    '.../node_labels.txt',
)
# Optional step (it was enabled for the experiments reported in the paper):
# call AdjustClassSamples from BalanceClassDistribution to handle outliers.
#data = AdjustClassSamples(data)

Import required packages

In [16]:
from torch_geometric.nn import SAGEConv
import torch.nn.functional as F
from torch.nn import LSTM, BatchNorm1d
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import StratifiedKFold, train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
from torch_geometric.utils import subgraph, add_self_loops
from torch_geometric.loader import DataLoader
import torch
import numpy as np
from torch_geometric.data import Data
import networkx as nx
from sklearn.metrics.pairwise import cosine_similarity

Definition of TemporalSAGE

In [13]:
# Use the GPU when PyTorch can see one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")


class TemporalSAGE(torch.nn.Module):
    """Two SAGEConv layers followed by an LSTM and a linear output layer.

    Each SAGEConv layer is followed by batch normalisation, ReLU and dropout
    (p=0.5). The resulting node embeddings are reshaped to
    ``(num_nodes, -1, hidden_channels)``, run through a batch-first LSTM, and
    the LSTM output at the last sequence position is mapped to
    ``out_channels`` values by the final linear layer.

    Args:
        in_channels: size of each input node-feature vector.
        hidden_channels: output width of both SAGEConv layers (LSTM input size).
        lstm_hidden_size: hidden size of the LSTM.
        out_channels: number of values produced per node.
    """

    def __init__(self, in_channels, hidden_channels, lstm_hidden_size, out_channels):
        super().__init__()
        # First graph-convolution stage.
        self.conv1 = SAGEConv(in_channels, hidden_channels)
        self.bn1 = BatchNorm1d(hidden_channels)
        # Second graph-convolution stage (keeps the hidden width).
        self.conv2 = SAGEConv(hidden_channels, hidden_channels)
        self.bn2 = BatchNorm1d(hidden_channels)
        # Sequence model and classification head.
        self.lstm = LSTM(hidden_channels, lstm_hidden_size, batch_first=True)
        self.fc = torch.nn.Linear(lstm_hidden_size, out_channels)
        self.dropout = torch.nn.Dropout(p=0.5)

    def forward(self, data):
        """Return a ``(data.num_nodes, out_channels)`` tensor of raw scores.

        ``data`` must expose ``x``, ``edge_index`` and ``num_nodes``; ``x`` and
        ``edge_index`` are moved to the module-level ``device`` first.
        """
        hidden = data.x.to(device)
        edge_index = data.edge_index.to(device)

        # conv -> batch norm -> ReLU -> dropout, once per stage.
        for conv, norm in ((self.conv1, self.bn1), (self.conv2, self.bn2)):
            hidden = self.dropout(F.relu(norm(conv(hidden, edge_index))))

        # Insert a sequence axis for the LSTM; the middle dimension is
        # inferred (it is 1 when `hidden` has exactly one row per node).
        sequence = hidden.view(data.num_nodes, -1, hidden.size(1))
        seq_out, _ = self.lstm(sequence)
        # Only the output at the final sequence position feeds the head.
        return self.fc(seq_out[:, -1, :])

Rule-based data augmentation: we propose adding edges between nodes with high cosine similarity and removing edges whose endpoints have low degree centrality

In [14]:
def addEdgesSimilarity(data, similarity_threshold=0.8, max_new_edges=100):
    """Append edges between highly similar nodes to ``data.edge_index``.

    Pairwise cosine similarity is computed over the rows of ``data.x``. Node
    pairs ``(i, j)`` with ``i < j`` whose similarity is strictly greater than
    ``similarity_threshold`` are scanned in row-major order (all ``j`` for
    ``i = 0``, then ``i = 1``, ...) and the first ``max_new_edges`` of them are
    appended to ``data.edge_index`` as columns ``[i, j]``.

    Args:
        data: graph object exposing ``x`` (node-feature tensor) and
            ``edge_index`` (``2 x E`` long tensor). It is modified in place.
        similarity_threshold: a pair qualifies only if its cosine similarity
            exceeds this value.
        max_new_edges: upper bound on the number of appended edges. Values
            ``<= 0`` add nothing.

    Returns:
        The same ``data`` object, with the new edges appended.

    Notes:
        * Each new edge is stored in the ``i -> j`` direction only, and pairs
          that already exist in ``data.edge_index`` are not filtered out.
        * The full ``n x n`` similarity matrix is held in memory.
    """
    node_features = data.x.detach().cpu().numpy()
    similarity = cosine_similarity(node_features)

    # The strict upper triangle visits every unordered pair (i < j) once.
    # np.nonzero reports hits in row-major order, i.e. exactly the order a
    # nested `for i ... for j > i` loop would find them in.
    src, dst = np.nonzero(np.triu(similarity > similarity_threshold, k=1))

    # Clamp so that a zero or negative cap really means "add no edges".
    limit = max(int(max_new_edges), 0)
    src, dst = src[:limit], dst[:limit]
    if src.size == 0:
        # Nothing qualified: leave edge_index untouched.
        return data

    new_edges = torch.from_numpy(np.stack([src, dst])).to(
        device=data.edge_index.device, dtype=torch.long
    )
    data.edge_index = torch.cat([data.edge_index, new_edges], dim=1)
    return data

In [10]:
def removeEdgesCentrality(data, remove_ratio=0.1):
    """Remove the edges whose endpoints have the lowest degree centrality.

    ``data.edge_index`` is loaded into an undirected ``networkx.Graph``. Every
    edge is scored by the sum of the degree centrality of its two endpoints,
    the ``int(num_edges * remove_ratio)`` lowest-scoring edges are removed, and
    ``data.edge_index`` is rebuilt from the edges that remain.

    Args:
        data: graph object exposing ``edge_index`` (``2 x E`` long tensor).
            It is modified in place.
        remove_ratio: fraction of the graph's edges to remove. The resulting
            count is clamped to ``[0, num_edges]``.

    Returns:
        The same ``data`` object with the pruned ``edge_index`` (always of
        shape ``2 x E'``, including ``2 x 0`` when no edge is left).

    Notes:
        Because the intermediate graph is undirected, duplicate and reversed
        edges of the input collapse into one edge, and every surviving edge is
        written back once, in a single direction.
        NOTE(review): if the input stored both directions of each edge, the
        reverse direction is lost here - confirm this is intended.
    """
    G = nx.Graph()
    # Plain Python ints keep node ids independent of numpy scalar types.
    G.add_edges_from(data.edge_index.t().cpu().tolist())

    # Score each edge by the summed degree centrality of its endpoints.
    centrality = nx.degree_centrality(G)
    edge_importance = [(u, v, centrality[u] + centrality[v]) for u, v in G.edges]
    # Stable sort: ties keep the graph's own edge order.
    edge_importance.sort(key=lambda item: item[2])

    # Clamp the count: a negative ratio would otherwise become a negative
    # slice bound and silently drop almost every edge.
    remove_count = int(len(edge_importance) * remove_ratio)
    remove_count = min(max(remove_count, 0), len(edge_importance))

    for u, v, _ in edge_importance[:remove_count]:
        G.remove_edge(u, v)

    remaining = list(G.edges)
    if remaining:
        edge_index = torch.tensor(remaining, dtype=torch.long).t().contiguous()
    else:
        # torch.tensor([]) would be 1-D; keep the 2 x E layout explicitly.
        edge_index = torch.empty((2, 0), dtype=torch.long)
    data.edge_index = edge_index.to(data.edge_index.device)
    return data

Data normalization, rule-based augmentation, and cross-validation setup

In [17]:
# Standardise the node features. The scaler is fitted on every node of the
# graph, i.e. before the cross-validation split further below.
scaler = StandardScaler()
x_scaled = scaler.fit_transform(data.x.numpy())
data.x = torch.tensor(x_scaled, dtype=torch.float)

# Rule-based augmentation as described in the paper: first add edges between
# similar nodes, then prune low-centrality edges. Both helpers modify `data`
# in place, so re-running this cell augments the graph a second time.
data = removeEdgesCentrality(
    addEdgesSimilarity(data, similarity_threshold=0.8, max_new_edges=100),
    remove_ratio=0.1,
)

# Stratified k-fold splitter used by the training loop.
k_folds = 5
skf = StratifiedKFold(n_splits=k_folds, shuffle=True, random_state=42)

# Per-fold metrics, kept separately for inductive and transductive evaluation.
inductiveResults = {metric: [] for metric in ('accuracy', 'precision', 'recall', 'f1', 'mrr')}
transductiveResults = {metric: [] for metric in ('accuracy', 'precision', 'recall', 'f1', 'mrr')}

# Every log line that is later written to the results file.
log_output = list()

Focal Loss definition for composite loss function

In [21]:
# Used to balance the class distribution
class FocalLoss(torch.nn.Module):
    """Focal loss computed from the per-sample cross-entropy.

    For every sample with cross-entropy ``ce`` the loss is
    ``alpha * (1 - p_t) ** gamma * ce`` where ``p_t = exp(-ce)``.

    Args:
        alpha: constant multiplier applied to every sample's loss.
        gamma: exponent of the ``(1 - p_t)`` factor.
        reduction: ``'mean'`` or ``'sum'`` to reduce over samples; any other
            value returns the unreduced per-sample losses.
    """

    def __init__(self, alpha=1, gamma=2, reduction='mean'):
        super().__init__()
        self.alpha, self.gamma, self.reduction = alpha, gamma, reduction

    def forward(self, inputs, targets):
        """Return the focal loss of logits ``inputs`` against ``targets``."""
        per_sample_ce = F.cross_entropy(inputs, targets, reduction='none')
        # exp(-ce) recovers the probability assigned to the true class.
        prob_true = torch.exp(-per_sample_ce)
        focal = self.alpha * (1 - prob_true) ** self.gamma * per_sample_ce

        if self.reduction == 'sum':
            return focal.sum()
        if self.reduction == 'mean':
            return focal.mean()
        return focal

Definition of Evaluation Metrics with Test Function

In [22]:
# Mean reciprocal rank (MRR)
def meanReciprocalRank(y_true, y_prob):
    """Average of ``1 / rank`` of the true label over all samples.

    For each sample the entries of its score vector are ordered by reversing
    an ascending argsort; the rank is the 1-based position of the true label
    in that ordering.

    Args:
        y_true: iterable of true class indices.
        y_prob: iterable of per-class score vectors, aligned with ``y_true``.

    Returns:
        The mean of the reciprocal ranks.
    """
    reciprocal_ranks = [
        1 / (np.flatnonzero(np.argsort(scores)[::-1] == label)[0] + 1)
        for label, scores in zip(y_true, y_prob)
    ]
    return np.mean(reciprocal_ranks)

# Test model function for both inductive and transductive reasoning
def test_model(model, test_loader, loss_fn, data_type="Inductive"):
    model.eval()
    test_losses = []
    test_correct = 0
    all_preds = []
    all_true = []
    all_probs = []
    with torch.no_grad():
        for batch in test_loader:
            out_test = model(batch)
            test_loss1 = loss_fn(out_test, batch.y.to(device))
            test_loss2 = loss_focal(out_test, batch.y.to(device))
            test_loss = 0.1 * test_loss1 + 0.9 * test_loss2  # Composite Loss Function definition
            test_losses.append(test_loss.item())
            _, pred_test = out_test.max(dim=1)
            test_correct += float(pred_test.eq(batch.y.to(device)).sum().item())
            all_preds.extend(pred_test.cpu().numpy())
            all_true.extend(batch.y.cpu().numpy())
            all_probs.extend(F.softmax(out_test, dim=1).cpu().numpy())

    test_loss = np.mean(test_losses)
    test_accuracy = test_correct / len(all_true)
    precision = precision_score(all_true, all_preds, average='weighted')
    recall = recall_score(all_true, all_preds, average='weighted')
    f1 = f1_score(all_true, all_preds, average='weighted')
    mrr = meanReciprocalRank(all_true, all_probs)

    outputResults = [f'{data_type} Reasoning (Test Loss): {test_loss:.4f}',f'{data_type} Reasoning (Test Accuracy): {test_accuracy:.4f}',f'{data_type} Reasoning (Precision): {precision:.4f}', f'{data_type} Reasoning (Recall): {recall:.4f}',
        f'{data_type} Reasoning(F1 Score): {f1:.4f}',
        f'{data_type} Reasoning(MRR): {mrr:.4f}']
    for result in outputResults:
        print(result)

    return test_accuracy, precision, recall, f1, mrr, outputResults

Training with Cross-validation to ensure robustness of the results

In [None]:
# Stratified k-fold cross-validation: train one fresh model per fold, then
# evaluate it in the inductive and in the transductive setting.
for fold, (train_val_idx, test_idx) in enumerate(skf.split(torch.arange(data.num_nodes), data.y.cpu().numpy())):
    fold_header = f'Fold {fold+1}/{k_folds}'
    log_output.append(fold_header)
    print(fold_header)

    # Seed torch per fold so weight initialisation and dropout are repeatable.
    torch.manual_seed(42 + fold)

    # Hold out 10% of the non-test nodes for validation. random_state makes
    # this split repeatable, in line with the seeded StratifiedKFold.
    train_idx, val_idx = train_test_split(
        train_val_idx,
        test_size=0.1,
        stratify=data.y[train_val_idx].cpu().numpy(),
        random_state=42,
    )
    train_idx = torch.tensor(train_idx, dtype=torch.long)
    val_idx = torch.tensor(val_idx, dtype=torch.long)
    test_idx = torch.tensor(test_idx, dtype=torch.long)

    # Inductive reasoning: each split only sees the edges among its own nodes,
    # so no information about the test nodes is used during training.
    trainSubgraph = subgraph(train_idx, data.edge_index, relabel_nodes=True, num_nodes=data.num_nodes)
    valSubgraph = subgraph(val_idx, data.edge_index, relabel_nodes=True, num_nodes=data.num_nodes)
    testSubgraph_inductive = subgraph(test_idx, data.edge_index, relabel_nodes=True, num_nodes=data.num_nodes)
    # Transductive reasoning: the model is evaluated on the complete graph.
    globalSubgraph = subgraph(torch.arange(data.num_nodes), data.edge_index, relabel_nodes=False, num_nodes=data.num_nodes)

    train_data = Data(x=data.x[train_idx], edge_index=trainSubgraph[0], y=data.y[train_idx])
    val_data = Data(x=data.x[val_idx], edge_index=valSubgraph[0], y=data.y[val_idx])
    Inductive_test_data = Data(x=data.x[test_idx], edge_index=testSubgraph_inductive[0], y=data.y[test_idx])
    # NOTE(review): this object carries every node and every label, so the
    # transductive metrics are computed over train/validation nodes as well
    # as test nodes. Restrict the evaluation to test_idx if test-only numbers
    # are intended - confirm.
    Transductive_test_data = Data(x=data.x, edge_index=globalSubgraph[0], y=data.y)

    # Each loader wraps a single graph, so every epoch is one full-graph batch.
    batch_size = 64
    train_loader = DataLoader([train_data], batch_size=batch_size, shuffle=True)
    val_loader = DataLoader([val_data], batch_size=batch_size)
    Inductive_test_loader = DataLoader([Inductive_test_data], batch_size=batch_size)
    Transductive_test_loader = DataLoader([Transductive_test_data], batch_size=batch_size)

    # Model, optimiser, LR schedule and the two loss terms.
    num_nodeFeatures = data.num_node_features
    numOfclasses = len(data.y.unique())
    hidden_channels = 256
    lstm_hid = 128
    model = TemporalSAGE(num_nodeFeatures, hidden_channels, lstm_hid, numOfclasses).to(device)
    optim = torch.optim.Adam(model.parameters(), lr=0.02, weight_decay=1e-4)
    scheduler = torch.optim.lr_scheduler.StepLR(optim, step_size=500, gamma=0.5)
    loss_fn = torch.nn.CrossEntropyLoss()
    loss_focal = FocalLoss()

    for epoch in range(4000):
        for batch in train_loader:
            optim.zero_grad()
            out = model(batch)
            targets = batch.y.to(device)
            loss_ce = loss_fn(out, targets)
            lossFocalValue = loss_focal(out, targets)
            # Composite loss: 0.1 * cross entropy + 0.9 * focal loss
            loss = 0.1 * loss_ce + 0.9 * lossFocalValue
            loss.backward()
            optim.step()
        scheduler.step()

        # Validate every 100 epochs.
        if (epoch + 1) % 100 == 0:
            model.eval()
            with torch.no_grad():
                val_losses = []
                val_correct = 0
                for batch in val_loader:
                    out_val = model(batch)
                    val_targets = batch.y.to(device)
                    val_loss1 = loss_fn(out_val, val_targets)
                    val_loss2 = loss_focal(out_val, val_targets)
                    val_loss = 0.1 * val_loss1 + 0.9 * val_loss2  # 0.1* cross entropy + 0.9*focal loss
                    val_losses.append(val_loss.item())
                    _, pred_val = out_val.max(dim=1)
                    val_correct += float(pred_val.eq(val_targets).sum().item())
                val_loss = np.mean(val_losses)
                val_accuracy = val_correct / len(val_data.y)
            # "Loss" is the cross-entropy term of the last training batch.
            epoch_summary = f'Epoch: {epoch+1}, Loss: {loss_ce.item()}, Validation Loss: {val_loss}, Validation Accuracy: {val_accuracy:.4f}, Learning Rate: {scheduler.get_last_lr()[0]:.6f}'
            log_output.append(epoch_summary)  # Log for validation results
            print(epoch_summary)
            model.train()

    inductive_accuracy, inductive_precision, inductive_recall, inductive_f1, inductive_mrr, inductive_log = test_model(model, Inductive_test_loader, loss_fn, "Inductive")
    # Fixed: the loader name was misspelled, which raised a NameError here.
    transductive_accuracy, transductive_precision, transductive_recall, transductive_f1, transductive_mrr, transductive_log = test_model(model, Transductive_test_loader, loss_fn, "Transductive")

    # Keep the per-fold test metrics in the log that is written to file.
    log_output.extend(inductive_log)
    log_output.extend(transductive_log)

    metric_names = ('accuracy', 'precision', 'recall', 'f1', 'mrr')
    inductive_scores = (inductive_accuracy, inductive_precision, inductive_recall, inductive_f1, inductive_mrr)
    transductive_scores = (transductive_accuracy, transductive_precision, transductive_recall, transductive_f1, transductive_mrr)
    for metric_name, score in zip(metric_names, inductive_scores):
        inductiveResults[metric_name].append(score)
    # Fixed: the dictionary is named `transductiveResults`; the previous
    # `transductive_results` did not exist and raised a NameError.
    for metric_name, score in zip(metric_names, transductive_scores):
        transductiveResults[metric_name].append(score)

    foldResults = f"Fold {fold+1} Results - Inductive Accuracy: {inductive_accuracy:.4f}, Transductive Accuracy: {transductive_accuracy:.4f}"
    print(foldResults)
    log_output.append(foldResults)  # Log for each fold's results


Calculating the mean and standard deviation of each performance metric across folds, to report them alongside the main results

In [None]:
def calculate_average_and_std(results_dict):
    """Summarise each list of values by its mean and standard deviation.

    Args:
        results_dict: mapping from a metric name to a list of values.

    Returns:
        ``(averages, std_devs)``: two dictionaries with the same keys as
        ``results_dict`` holding ``np.mean`` and ``np.std`` of each list.
    """
    averages = {metric: np.mean(scores) for metric, scores in results_dict.items()}
    std_devs = {metric: np.std(scores) for metric, scores in results_dict.items()}
    return averages, std_devs

# Collapse the per-fold metrics of each evaluation setting into a pair of
# {metric: mean} and {metric: standard deviation} dictionaries.
inductive_avg, inductive_std = calculate_average_and_std(results_dict=inductiveResults)
transductive_avg, transductive_std = calculate_average_and_std(results_dict=transductiveResults)

In [25]:
# Final summary lines: mean and standard deviation of every metric across
# folds, first for the inductive and then for the transductive setting.
output = [
    f'Final Inductive Average Results - Accuracy: {inductive_avg["accuracy"]:.4f}, Precision: {inductive_avg["precision"]:.4f}, Recall: {inductive_avg["recall"]:.4f}, F1: {inductive_avg["f1"]:.4f}, MRR: {inductive_avg["mrr"]:.4f}',
    f'Final Inductive Standard Deviations - Accuracy: {inductive_std["accuracy"]:.4f}, Precision: {inductive_std["precision"]:.4f}, Recall: {inductive_std["recall"]:.4f}, F1: {inductive_std["f1"]:.4f}, MRR: {inductive_std["mrr"]:.4f}',
    f'Final Transductive Average Results - Accuracy: {transductive_avg["accuracy"]:.4f}, Precision: {transductive_avg["precision"]:.4f}, Recall: {transductive_avg["recall"]:.4f}, F1: {transductive_avg["f1"]:.4f}, MRR: {transductive_avg["mrr"]:.4f}',
    f'Final Transductive Standard Deviations - Accuracy: {transductive_std["accuracy"]:.4f}, Precision: {transductive_std["precision"]:.4f}, Recall: {transductive_std["recall"]:.4f}, F1: {transductive_std["f1"]:.4f}, MRR: {transductive_std["mrr"]:.4f}',
]

In [None]:
# Write the training log followed by the final summary to a text file.
# '.../results.txt' is a placeholder: replace it with a real local path.
with open('.../results.txt', 'w') as f:
    f.writelines(line + '\n' for line in log_output)
    f.writelines(line + '\n' for line in output)

# Show the final summary in the notebook as well.
for line in output:
    print(line)