In [1]:
from ImportLocalData import loadData

In [2]:
from BalanceClassDistribution import AdjustClassSamples, NumberOfSamplesClass

In [None]:
# Load one of the custom knowledge-graph (KG) exports.
# All four input files are expected in the same folder: set DATA_DIR once
# instead of editing every path (the original mixed '...' placeholders with one
# machine-specific absolute path).
DATA_DIR = '...'  # TODO: change to the folder that holds your local KG files
data = loadData(
    f'{DATA_DIR}/node_features.txt',
    f'{DATA_DIR}/edges.txt',
    f'{DATA_DIR}/edge_features.txt',
    f'{DATA_DIR}/node_labels.txt',
)
# Optional: call AdjustClassSamples to handle outlier classes if you need to
# (per the original note, this step was used for the paper's experiments).
#data = AdjustClassSamples(data)

Import required packages

In [None]:
import torch
import torch.nn.functional as F
from torch_geometric.nn import GATConv, GAE
from torch.nn import LSTM
from torch_geometric.utils import subgraph
from torch_geometric.loader import DataLoader
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import StratifiedKFold, train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import numpy as np
from torch_geometric.data import Data

Definition of FusionGAT Model

In [4]:
# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# GAT-LSTM based activity recognition module, paired with a GAT encoder/decoder
# that reconstructs the node features for deeper analysis.
class FusionGAT(torch.nn.Module):
    """Two-headed graph model sharing the same input graph.

    * Classification head: GATConv (2 heads, concatenated) -> ELU -> dropout ->
      GATConv (1 head) -> ELU -> LSTM -> linear layer producing ``out_channels``
      scores per node.
    * Reconstruction head: an independent GATConv -> ELU -> GATConv encoder
      followed by a linear decoder that maps back to ``in_channels`` values per
      node.

    Args:
        in_channels: size of each input node feature vector.
        hidden_channels: output size of every GAT layer (per head).
        lstm_hidden_size: hidden size of the LSTM.
        out_channels: number of scores produced per node by the classifier.
    """

    def __init__(self, in_channels, hidden_channels, lstm_hidden_size, out_channels):
        super().__init__()

        # --- classification head (GAT -> LSTM -> linear) ---
        # conv1 concatenates its 2 heads, so conv2 receives hidden_channels * 2.
        self.conv1 = GATConv(in_channels, hidden_channels, heads=2, concat=True)
        self.conv2 = GATConv(hidden_channels * 2, hidden_channels, heads=1, concat=False)
        self.lstm = LSTM(hidden_channels, lstm_hidden_size, batch_first=True)
        self.fc = torch.nn.Linear(lstm_hidden_size, out_channels)
        self.dropout = torch.nn.Dropout(p=0.5)

        # --- reconstruction head (GAT encoder -> linear decoder) ---
        self.encoder_conv1 = GATConv(in_channels, hidden_channels, heads=2, concat=True)
        self.encoder_conv2 = GATConv(hidden_channels * 2, hidden_channels, heads=1, concat=False)
        self.fc_decoder = torch.nn.Linear(hidden_channels, in_channels)  # back to input feature size

    def forward(self, data):
        """Run both heads on ``data`` (an object exposing ``x``, ``edge_index``, ``num_nodes``).

        Returns:
            ``(out_cls, out_reconstructed)``: the classifier output for every
            node and the decoder's reconstruction of the node features.
        """
        node_feats = data.x.to(device)
        edges = data.edge_index.to(device)

        # Classification branch.
        hidden = self.dropout(F.elu(self.conv1(node_feats, edges)))
        hidden = F.elu(self.conv2(hidden, edges))
        # One sequence per node for the batch-first LSTM
        # (presumably of length 1, since conv2 emits one vector per node).
        sequences = hidden.view(data.num_nodes, -1, hidden.size(1))
        lstm_out, _ = self.lstm(sequences)
        out_cls = self.fc(lstm_out[:, -1, :])  # last time step only

        # Graph autoencoder branch (no dropout, no activation after the 2nd layer).
        latent = F.elu(self.encoder_conv1(node_feats, edges))
        latent = self.encoder_conv2(latent, edges)
        out_reconstructed = self.fc_decoder(latent)

        return out_cls, out_reconstructed


In [7]:
# Standardise the node features: scikit-learn works on CPU numpy arrays, so the
# tensor is moved to the CPU, scaled, and the result is placed on `device`.
# NOTE(review): the scaler is fitted on every node, i.e. before the train/test
# split below - confirm this is intended.
scaler = StandardScaler()
x_scaled = scaler.fit_transform(data.x.cpu().numpy())
data.x = torch.tensor(x_scaled, dtype=torch.float).to(device)

# Stratified k-fold keeps the class proportions balanced in every fold.
# The number of folds is a free choice.
k_folds = 5
skf = StratifiedKFold(n_splits=k_folds, shuffle=True, random_state=42)

# Per-fold metric values for the inductive and the transductive evaluation.
inductive_results = {metric: [] for metric in ('accuracy', 'precision', 'recall', 'f1', 'mrr')}
transductive_results = {metric: [] for metric in ('accuracy', 'precision', 'recall', 'f1', 'mrr')}

# Fold headers and periodic epoch results, collected for writing to file later.
log_output = list()

Definition of Evaluation Metrics with Test Function

In [8]:
# Mean Reciprocal Rank (MRR)
def meanReciprocalRank(y_true, y_prob):
    """Average of 1 / rank of the true label over all samples.

    For each sample the class scores in ``y_prob`` are ordered from highest to
    lowest; the rank of the true label is its 1-based position in that order.

    Args:
        y_true: iterable of true class indices.
        y_prob: iterable of per-class score vectors, aligned with ``y_true``.

    Returns:
        The mean of the reciprocal ranks (as computed by ``np.mean``).
    """
    reciprocal_ranks = [
        # position of the true label in the descending ordering, made 1-based
        1 / (np.nonzero(np.argsort(scores)[::-1] == label)[0][0] + 1)
        for label, scores in zip(y_true, y_prob)
    ]
    return np.mean(reciprocal_ranks)

# Test model function for both inductive and transductive reasoning
def test_model(model, test_loadr, loss_fn, reconsruction_loss_fn, data_type="Inductive"):
    """Evaluate ``model`` on every batch of a loader and print the metrics.

    Args:
        model: module returning ``(class_scores, reconstruction)`` for a batch.
        test_loadr: iterable of batches exposing ``x`` and ``y`` (and ``.to``).
        loss_fn: classification loss, called as ``loss_fn(scores, batch.y)``.
        reconsruction_loss_fn: reconstruction loss, called as
            ``fn(reconstruction, batch.x)``.
        data_type: label used as prefix in the printed/returned result lines.

    Returns:
        ``(accuracy, precision, recall, f1, mrr, result_lines)`` where precision,
        recall and F1 are class-weighted averages and ``result_lines`` is the
        list of formatted strings that was printed.
    """
    # The (misspelled) parameter names are kept so existing calls keep working;
    # bind them to readable locals. The original body read `test_loader` and
    # `reconstruction_loss_fn`, which are not parameters of this function.
    test_loader = test_loadr
    reconstruction_loss_fn = reconsruction_loss_fn

    model.eval()
    testLosses = []
    testCorrect = 0
    allPreds = []
    allTrue = []
    allProbs = []

    with torch.no_grad():
        for batch in test_loader:
            batch = batch.to(device)
            out_test_cls, out_test_recon = model(batch)
            test_loss_cls = loss_fn(out_test_cls, batch.y)
            test_loss_recon = reconstruction_loss_fn(out_test_recon, batch.x)
            # Composite loss: classification + 0.2 * reconstruction
            test_loss = test_loss_cls + 0.2 * test_loss_recon
            testLosses.append(test_loss.item())
            _, pred_test = out_test_cls.max(dim=1)
            testCorrect += float(pred_test.eq(batch.y).sum().item())

            allPreds.extend(pred_test.cpu().numpy())
            allTrue.extend(batch.y.cpu().numpy())
            allProbs.extend(F.softmax(out_test_cls, dim=1).cpu().numpy())

    testLoss = np.mean(testLosses)
    testAccuracy = testCorrect / len(allTrue)

    precision = precision_score(allTrue, allPreds, average='weighted')
    recall = recall_score(allTrue, allPreds, average='weighted')
    f1 = f1_score(allTrue, allPreds, average='weighted')
    mrr = meanReciprocalRank(allTrue, allProbs)

    outputResults = [f'{data_type} Reasoning (Test Loss): {testLoss:.4f}', f'{data_type} Reasoning(Test Accuracy): {testAccuracy:.4f}', f'{data_type} Reasoning (Precision): {precision:.4f}',
        f'{data_type} Reasoning (Recall): {recall:.4f}', f'{data_type} Reasoning (F1 Score): {f1:.4f}', f'{data_type} Reasoning(MRR): {mrr:.4f}']

    for result in outputResults:
        print(result)
    # Return the names actually defined above (the original returned the
    # undefined `test_accuracy` / `output_results`).
    return testAccuracy, precision, recall, f1, mrr, outputResults

Training with Cross-validation to ensure robustness of the results

In [None]:
# Stratified k-fold cross-validation: for every fold a fresh model is trained on
# the training nodes and then evaluated inductively and transductively.
for fold, (train_val_idx, test_idx) in enumerate(skf.split(torch.arange(data.num_nodes), data.y.cpu().numpy())):
    log_output.append(f'Fold {fold+1}/{k_folds}')  # Log for current fold
    print(f'Fold {fold+1}/{k_folds}')

    # Hold out 10% of the non-test nodes (stratified by label) for validation.
    train_idx, val_idx = train_test_split(train_val_idx, test_size=0.1, stratify=data.y[train_val_idx].cpu().numpy())
    train_idx = torch.tensor(train_idx, dtype=torch.long)
    val_idx = torch.tensor(val_idx, dtype=torch.long)
    test_idx = torch.tensor(test_idx, dtype=torch.long)

    # Inductive Reasoning: each split only keeps the edges between its own nodes
    # (relabelled to 0..n-1), so no test information is used during training.
    train_subgraph = subgraph(train_idx, data.edge_index, relabel_nodes=True, num_nodes=data.num_nodes)
    val_subgraph = subgraph(val_idx, data.edge_index, relabel_nodes=True, num_nodes=data.num_nodes)
    test_subgraph_inductive = subgraph(test_idx, data.edge_index, relabel_nodes=True, num_nodes=data.num_nodes)
    # Transductive Reasoning: the trained model is evaluated on the full graph
    # (all nodes and all edges).
    # NOTE(review): test_data_transductive carries the labels of every node, so
    # the transductive metrics below are computed over training and validation
    # nodes as well as test nodes - confirm this is intended.
    global_subgraph = subgraph(torch.arange(data.num_nodes), data.edge_index, relabel_nodes=False, num_nodes=data.num_nodes)
    train_data = Data(x=data.x[train_idx], edge_index=train_subgraph[0], y=data.y[train_idx])
    val_data = Data(x=data.x[val_idx], edge_index=val_subgraph[0], y=data.y[val_idx])
    test_data_inductive = Data(x=data.x[test_idx], edge_index=test_subgraph_inductive[0], y=data.y[test_idx])
    test_data_transductive = Data(x=data.x, edge_index=global_subgraph[0], y=data.y)

    # Data Loader for PyTorch Geometric (each loader wraps a single graph)
    batch_size = 32  #you can change batch_size
    train_loader = DataLoader([train_data], batch_size=batch_size, shuffle=True)
    val_loader = DataLoader([val_data], batch_size=batch_size)
    test_loader_inductive = DataLoader([test_data_inductive], batch_size=batch_size)
    test_loader_transductive = DataLoader([test_data_transductive], batch_size=batch_size)

    # Initializes the model:
    # Uses the Adam algorithm for parameter optimization
    # Performs dynamic learning rate adjustment (halved every 500 epochs)
    # Defines a suitable function for loss calculation
    num_node_features = data.num_node_features
    num_classes = len(data.y.unique())
    hidden_channels = 256
    lstm_hidden_size = 128
    model = FusionGAT(num_node_features, hidden_channels, lstm_hidden_size, num_classes).to(device)
    optimizer = torch.optim.Adam(model.parameters(), lr=0.02, weight_decay=1e-4)
    scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=500, gamma=0.5)
    classification_loss_fn = torch.nn.CrossEntropyLoss()
    reconstruction_loss_fn = torch.nn.MSELoss()

    # Training Phase
    for epoch in range(4000): #you can change number of epoch
        model.train()
        epoch_loss = 0
        for batch in train_loader:
            batch = batch.to(device)
            optimizer.zero_grad()
            out_cls, out_reconstructed = model(batch)

            # Classification and Reconstruction Loss calculation
            loss_cls = classification_loss_fn(out_cls, batch.y)
            loss_recon = reconstruction_loss_fn(out_reconstructed, batch.x)

            # Total composite loss calculation
            loss = loss_cls + 0.2 * loss_recon
            loss.backward()
            optimizer.step()
            epoch_loss += loss.item()

        # Update Scheduler
        scheduler.step()

        # Printing validation loss and accuracy every 100 epochs
        if (epoch + 1) % 100 == 0:
            model.eval()
            with torch.no_grad():
                val_losses = []
                val_correct = 0
                for batch in val_loader:
                    batch = batch.to(device)
                    out_val_cls, out_val_recon = model(batch)
                    val_loss_cls = classification_loss_fn(out_val_cls, batch.y)
                    val_loss_recon = reconstruction_loss_fn(out_val_recon, batch.x)
                    val_losses.append((val_loss_cls + 0.2 * val_loss_recon).item())
                    _, pred_val = out_val_cls.max(dim=1)
                    val_correct += float(pred_val.eq(batch.y).sum().item())
                val_loss = np.mean(val_losses)
                val_accuracy = val_correct / len(val_data.y)
            # Build the message once; it is both logged and printed.
            epoch_message = f'Epoch: {epoch+1}, Loss: {loss.item():.4f}, Validation Loss: {val_loss:.4f}, Validation Accuracy: {val_accuracy:.4f}, Learning Rate: {scheduler.get_last_lr()[0]:.6f}'
            log_output.append(epoch_message)
            print(epoch_message)

    # Test step for each fold
    inductiveAccuracy, inductivePrecision, inductiveRecall, inductive_f1, inductiveMrr, inductiveLog = test_model(model, test_loader_inductive, classification_loss_fn, reconstruction_loss_fn, "Inductive")
    transductiveAccuracy, transductivePrecision, transductiveRecall, transductive_f1, transductiveMrr, transductive_log = test_model(model, test_loader_transductive, classification_loss_fn, reconstruction_loss_fn, "Transductive")
    # Store the fold metrics in the dictionaries created before the loop
    # (`inductive_results` / `transductive_results`).
    inductive_results['accuracy'].append(inductiveAccuracy)
    inductive_results['precision'].append(inductivePrecision)
    inductive_results['recall'].append(inductiveRecall)
    inductive_results['f1'].append(inductive_f1)
    inductive_results['mrr'].append(inductiveMrr)
    transductive_results['accuracy'].append(transductiveAccuracy)
    transductive_results['precision'].append(transductivePrecision)
    transductive_results['recall'].append(transductiveRecall)
    transductive_results['f1'].append(transductive_f1)
    transductive_results['mrr'].append(transductiveMrr)

    foldResults = f"Fold {fold+1} Results - Inductive Accuracy: {inductiveAccuracy:.4f}, Transductive Accuracy: {transductiveAccuracy:.4f}"
    print(foldResults)
    log_output.append(foldResults)  # keep fold results


Calculating means and standard deviations of performance results of folds

In [None]:
def calculateAverageStd(results_dict):
    """Summarise every metric of a results dictionary.

    Args:
        results_dict: mapping of metric name -> sequence of values.

    Returns:
        ``(averages, std_devs)``: two dictionaries with the same keys as
        ``results_dict`` holding ``np.mean`` and ``np.std`` of each sequence.
    """
    averages = {metric: np.mean(scores) for metric, scores in results_dict.items()}
    std_devs = {metric: np.std(scores) for metric, scores in results_dict.items()}
    return averages, std_devs

# Mean and standard deviation of every metric across the folds.
# The per-fold dictionaries are named `inductive_results` / `transductive_results`
# where they are created; the camelCase names used here before were undefined.
inductive_avg, inductive_std = calculateAverageStd(inductive_results)
transductive_avg, transductive_std = calculateAverageStd(transductive_results)

Build the summary lines of the final results (they are printed and written to file in the next cell)

In [None]:
def _formatSummaryLine(heading, stats):
    """Format one summary line from a metric dictionary.

    ``stats`` must provide the keys 'accuracy', 'precision', 'recall', 'f1' and
    'mrr'; every value is rendered with four decimals.
    """
    return (f'{heading} (Accuracy): {stats["accuracy"]:.4f}, Precision: {stats["precision"]:.4f}, '
            f'Recall: {stats["recall"]:.4f}, F1: {stats["f1"]:.4f}, MRR: {stats["mrr"]:.4f}')


# One line per (evaluation mode, statistic). Using a single formatter keeps the
# labels consistent (the original lines had "MR" for MRR and misspelled headings).
output = [
    _formatSummaryLine('Final Inductive Average Results', inductive_avg),
    _formatSummaryLine('Final Inductive Standard Deviations', inductive_std),
    _formatSummaryLine('Final Transductive Average Results', transductive_avg),
    _formatSummaryLine('Final Transductive Standard Deviations', transductive_std),
]

Writing the training log and the final results to a .txt file, then printing the final results

In [None]:
# Persist the training log followed by the final summary lines (one entry per
# line), then echo the summary lines to the notebook output.
with open('.../results.txt', 'w') as f:
    f.writelines(entry + '\n' for entry in log_output)
    f.writelines(entry + '\n' for entry in output)
for summary_line in output:
    print(summary_line)