<a href="https://colab.research.google.com/github/aroos2147/gigagraphgoo/blob/main/ggg.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Giga Graph Goo

## Setup
In this section we prepare the environment by installing the necessary packages and importing the required libraries

In [7]:
# Dependencies
!pip install torch_geometric
!pip install rdkit



In [8]:
# Imports
import torch
from torch_geometric.data import DataLoader
from torch_geometric.nn import GCNConv, global_mean_pool
import torch.nn.functional as F
import torch.nn as nn
import pandas as pd
import random as rd
import os
from rdkit import Chem
from rdkit.Chem import rdmolops
from google.colab import drive
from torch_geometric.loader import DataLoader
from torch_geometric.data import Data
from sklearn.model_selection import ParameterGrid


## Get the data
We will be working with a subset of the Malnet dataset. The subset consists of the function call graphs of benign android applications and of android applications associated with the Artemis trojan. The data is stored on a private google drive.

In [9]:
drive.mount('/content/drive')

Mounted at /content/drive


In [10]:
def read_edgelist_to_graph(filepath, label):
    try:
        edge_index = []
        with open(filepath, 'r') as f:
            for line in f:
                if line.startswith('#'):
                    continue
                source, target = map(int, line.strip().split())
                edge_index.append([source, target])

        edge_index = torch.tensor(edge_index, dtype=torch.long).t().contiguous()
        x = torch.ones(edge_index.max().item() + 1, 1, dtype=torch.float)
        data = Data(x=x, edge_index=edge_index, y=torch.tensor([label]))
        return data
    except Exception as e:
        print(f"Error reading file {filepath}: {e}")
        return None

benign_dir = '/content/drive/My Drive/malnet/benign/'
artemis_dir = '/content/drive/My Drive/malnet/artemis/'

data_list = []

# Load graphs from the 'benign' directory
for filename in os.listdir(benign_dir):
    if filename.endswith(".edgelist"):
        filepath = os.path.join(benign_dir, filename)
        # label 0 is non malicious
        graph_data = read_edgelist_to_graph(filepath, 0)
        if graph_data:
            data_list.append(graph_data)

# Load graphs from the 'artemis' directory
for filename in os.listdir(artemis_dir):
    if filename.endswith(".edgelist"):
        filepath = os.path.join(artemis_dir, filename)
        # label 1 is malicious
        graph_data = read_edgelist_to_graph(filepath, 1)
        if graph_data:
            data_list.append(graph_data)

print(f"Loaded {len(data_list)} graphs.")

Loaded 2000 graphs.


## Prepare the data
The data is being shuffeled and split into suitable train, validate and test splits which are used in preparing, training and evaluating the performance of the model.

In [11]:
# Shuffle the data list
rd.shuffle(data_list)

# Define the split ratios
train_ratio = 0.8
val_ratio = 0.1
test_ratio = 0.1

# Calculate the split indices
train_size = int(len(data_list) * train_ratio)
val_size = int(len(data_list) * val_ratio)
test_size = len(data_list) - train_size - val_size

# Split the data
train_data = data_list[:train_size]
val_data = data_list[train_size:train_size + val_size]
test_data = data_list[train_size + val_size:]

# Create data loaders
train_loader = DataLoader(train_data, batch_size=32, shuffle=True)
val_loader = DataLoader(val_data, batch_size=32, shuffle=False)
test_loader = DataLoader(test_data, batch_size=32, shuffle=False)

print(f"Train size: {len(train_data)}")
print(f"Validation size: {len(val_data)}")
print(f"Test size: {len(test_data)}")

Train size: 1600
Validation size: 200
Test size: 200


## Define a Model
In the following we define a generic graph convolutional network, the hyperparameters of which are parameterized in the constructor. This allows us to perform a grid search over a wide range of hyperparamter options in order to find a good combination for the final model structure. This is done using the training set to train a model for each hyperparameter combination and using the validation set to test the best performing model. The hyperparameter combnation of the best model is then used in training the final classifier.

In [12]:
class GCNClassifier(nn.Module):
    def __init__(self, hidden_dim, num_layers, dropout, activation=F.relu):
        super(GCNClassifier, self).__init__()

        self.dropout = dropout
        self.activation = activation

        # Initialize GCN layers
        self.layers = nn.ModuleList()
        self.layers.append(GCNConv(hidden_dim, hidden_dim))  # First layer
        for _ in range(num_layers - 1):
            self.layers.append(GCNConv(hidden_dim, hidden_dim))  # Additional layers

        # Output layer
        self.fc = nn.Linear(hidden_dim, 1)

    def forward(self, edge_index, batch, num_nodes):
        node_embeddings = nn.Parameter(torch.randn(num_nodes, self.layers[0].in_channels))  # Size: (num_nodes, hidden_dim)
        x = node_embeddings

        # Apply GCN layers
        for layer in self.layers:
            x = layer(x, edge_index)
            x = self.activation(x)
            x = F.dropout(x, p=self.dropout, training=self.training)

        # Apply global mean pooling to get graph-level representation
        x = global_mean_pool(x, batch)

        # Output layer
        x = self.fc(x)
        return x

In [23]:
def train(model, train_loader, val_loader, optimizer, criterion, device, patience=5, epochs=100):
    model.train()  # Set the model to training mode
    total_loss = 0
    best_val_loss = float('inf')  # Initialize the best validation loss as infinity
    epochs_without_improvement = 0  # Track how many epochs have passed without improvement
    best_model_state = None  # To save the best model state

    for epoch in range(epochs):
        model.train()  # Ensure the model is in training mode
        total_loss = 0

        for data in train_loader:
            data = data.to(device)  # Move the data to the appropriate device (GPU or CPU)
            optimizer.zero_grad()  # Zero the gradients before the backward pass

            num_nodes = data.num_nodes  # Get the number of nodes in the current graph

            # Forward pass
            out = model(data.edge_index, data.batch, num_nodes)

            # Compute loss (ensure the output is squeezed to match the shape of data.y)
            loss = criterion(out.squeeze(), data.y.float().to(device))
            loss.backward()  # Backward pass to calculate gradients

            optimizer.step()  # Update model parameters

            total_loss += loss.item() * data.num_graphs  # Accumulate the loss

        avg_train_loss = total_loss / len(train_loader.dataset)  # Compute average loss per graph

        # Validation phase
        model.eval()  # Set the model to evaluation mode
        val_loss = 0
        with torch.no_grad():
            for data in val_loader:
                data = data.to(device)
                num_nodes = data.num_nodes  # Get the number of nodes for the validation graph
                out = model(data.edge_index, data.batch, num_nodes)
                loss = criterion(out.squeeze(), data.y.float().to(device))
                val_loss += loss.item() * data.num_graphs

        avg_val_loss = val_loss / len(val_loader.dataset)  # Compute average validation loss

        # Early Stopping Check
        if avg_val_loss < best_val_loss:
            best_val_loss = avg_val_loss
            epochs_without_improvement = 0
            best_model_state = model.state_dict()  # Save the best model state
        else:
            epochs_without_improvement += 1

        # Stop early if no improvement for 'patience' epochs
        if epochs_without_improvement >= patience:
            print(f"Early stopping triggered after {epoch + 1} epochs")
            model.load_state_dict(best_model_state)  # Restore the best model state
            break

        print(f"Epoch {epoch + 1}/{epochs} - Train Loss: {avg_train_loss:.4f} - Val Loss: {avg_val_loss:.4f}")

    return model

In [14]:
def evaluate(model, data_loader, criterion, device):
    model.eval()  # Set the model to evaluation mode
    total_loss = 0
    correct = 0
    total = 0

    with torch.no_grad():
        for data in data_loader:
            data = data.to(device)
            num_nodes = data.num_nodes  # Get the number of nodes for the current graph

            out = model(data.edge_index, data.batch, num_nodes)
            loss = criterion(out.squeeze(), data.y.float().to(device))
            total_loss += loss.item() * data.num_graphs  # Accumulate loss

            # Binary classification accuracy
            pred = torch.sigmoid(out.squeeze()) > 0.5
            correct += pred.eq(data.y.to(device)).sum().item()
            total += data.num_graphs

    avg_loss = total_loss / len(data_loader.dataset)
    accuracy = correct / total
    return avg_loss, accuracy

In [15]:
def train_model(model, train_loader, val_loader, epochs, learning_rate, dropout, device):
    optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
    criterion = torch.nn.BCEWithLogitsLoss()

    model = model.to(device)

    for epoch in range(epochs):
        train_loss = train(model, train_loader, optimizer, criterion, device)
        val_loss, val_accuracy = evaluate(model, val_loader, criterion, device)

    test_loss, test_accuracy = evaluate(model, test_loader, criterion, device)
    print(f'Validation Loss: {test_loss:.4f}, Validation Accuracy: {test_accuracy:.4f}')

In [16]:
def grid_search(train_loader, val_loader, test_loader, device, patience=10):
    # Define hyperparameter grid
    param_grid = {
        'hidden_dim': [64],
        'num_layers': [2],
        'dropout': [0.5, 0.3],
        'learning_rate': [0.001]
    }

    # Initialize grid search
    best_model = None
    best_val_loss = float('inf')  # Initialize best validation loss as infinity
    best_params = None

    # Perform grid search
    for params in ParameterGrid(param_grid):
        print(f"Training with parameters: {params}")

        # Initialize the model with the current hyperparameters
        model = GCNClassifier(
            hidden_dim=params['hidden_dim'],
            num_layers=params['num_layers'],
            dropout=params['dropout'],
            activation=F.relu
        ).to(device)

        # Optimizer and loss function
        optimizer = torch.optim.Adam(model.parameters(), lr=params['learning_rate'])
        criterion = torch.nn.BCEWithLogitsLoss()

        # Train the model using early stopping with the train and validation sets
        model = train(model, train_loader, val_loader, optimizer, criterion, device, patience)

        # Evaluate on the validation set
        val_loss, _ = evaluate(model, val_loader, criterion, device)

        # Save the best model based on validation loss
        if val_loss < best_val_loss:
            best_val_loss = val_loss
            best_model = model
            best_params = params

    # Final evaluation on the test set using the best model
    test_loss, test_accuracy = evaluate(best_model, test_loader, criterion, device)
    print(f"Final Test Loss: {test_loss:.4f}, Test Accuracy: {test_accuracy:.4f}")

    return best_model, best_params

In [11]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

best_model, best_params = grid_search(train_loader, val_loader, test_loader, device, patience=10)
print(f"Best Hyperparameters: {best_params}")


Training with parameters: {'dropout': 0.5, 'hidden_dim': 64, 'learning_rate': 0.001, 'num_layers': 2}
Epoch 1/50 - Train Loss: 0.6924 - Val Loss: 0.6913
Epoch 2/50 - Train Loss: 0.6929 - Val Loss: 0.6912
Epoch 3/50 - Train Loss: 0.6881 - Val Loss: 0.6872
Epoch 4/50 - Train Loss: 0.6887 - Val Loss: 0.6860
Epoch 5/50 - Train Loss: 0.6882 - Val Loss: 0.6878
Epoch 6/50 - Train Loss: 0.6886 - Val Loss: 0.6845
Epoch 7/50 - Train Loss: 0.6883 - Val Loss: 0.6881
Epoch 8/50 - Train Loss: 0.6864 - Val Loss: 0.6830
Epoch 9/50 - Train Loss: 0.6831 - Val Loss: 0.6826
Epoch 10/50 - Train Loss: 0.6825 - Val Loss: 0.6921
Epoch 11/50 - Train Loss: 0.6827 - Val Loss: 0.6825
Epoch 12/50 - Train Loss: 0.6799 - Val Loss: 0.6857
Epoch 13/50 - Train Loss: 0.6790 - Val Loss: 0.6866
Epoch 14/50 - Train Loss: 0.6791 - Val Loss: 0.6784
Epoch 15/50 - Train Loss: 0.6768 - Val Loss: 0.6739
Epoch 16/50 - Train Loss: 0.6744 - Val Loss: 0.6800
Epoch 17/50 - Train Loss: 0.6711 - Val Loss: 0.6804
Epoch 18/50 - Train Los

## Train the Model
The model is now fixed in terms of hyperparameters and will be trained using the union of the training and validation sets.

Best Hyperparameters: \{'dropout': 0.5, 'hidden_dim': 64, 'learning_rate': 0.001, 'num_layers': 2}

In [46]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

model = GCNClassifier(
            hidden_dim=28,
            num_layers=2,
            dropout=0.5,
            activation=F.relu
        ).to(device)

optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
criterion = torch.nn.BCEWithLogitsLoss()


In [47]:
combined_data = train_data + val_data
combined_loader = DataLoader(combined_data, batch_size=32, shuffle=True)

In [48]:
model = train(model, combined_loader, test_loader, optimizer, criterion, device, patience=10)

Epoch 1/100 - Train Loss: 0.6938 - Val Loss: 0.6897
Epoch 2/100 - Train Loss: 0.6918 - Val Loss: 0.6897
Epoch 3/100 - Train Loss: 0.6903 - Val Loss: 0.6875
Epoch 4/100 - Train Loss: 0.6891 - Val Loss: 0.6875
Epoch 5/100 - Train Loss: 0.6869 - Val Loss: 0.6840
Epoch 6/100 - Train Loss: 0.6893 - Val Loss: 0.6857
Epoch 7/100 - Train Loss: 0.6871 - Val Loss: 0.6877
Epoch 8/100 - Train Loss: 0.6839 - Val Loss: 0.6844
Epoch 9/100 - Train Loss: 0.6831 - Val Loss: 0.6836
Epoch 10/100 - Train Loss: 0.6839 - Val Loss: 0.6845
Epoch 11/100 - Train Loss: 0.6823 - Val Loss: 0.6827
Epoch 12/100 - Train Loss: 0.6827 - Val Loss: 0.6794
Epoch 13/100 - Train Loss: 0.6790 - Val Loss: 0.6822
Epoch 14/100 - Train Loss: 0.6785 - Val Loss: 0.6875
Epoch 15/100 - Train Loss: 0.6774 - Val Loss: 0.6751
Epoch 16/100 - Train Loss: 0.6766 - Val Loss: 0.6711
Epoch 17/100 - Train Loss: 0.6750 - Val Loss: 0.6766
Epoch 18/100 - Train Loss: 0.6739 - Val Loss: 0.6735
Epoch 19/100 - Train Loss: 0.6694 - Val Loss: 0.6647
Ep

## Evaluate the Model
The performance of the model is tested by evaluating the classificaiton accuracy of the trained model on the test dataset, which the model has never seen before.

In [49]:
test_loss, test_accuracy = evaluate(model, test_loader, criterion, device)
print(f"Final Test Loss: {test_loss:.4f}, Test Accuracy: {test_accuracy:.4f}")

Final Test Loss: 0.5926, Test Accuracy: 0.6950
