In [1]:
import torch
import random
import dgl
import os

In [2]:
def preProcessData(data, data_name="mutagenicity", processed_data_dir="processed_data"):
    # Unpack the data into graphs and labels
    graphs, labels = zip(*data)
    print(type(graphs))
    # Preprocess each graph in the dataset
    for graph in graphs:
        # Compute the shortest distance matrix
        shortest_dist_matrix = dgl.shortest_dist(graph)

        # Calculate normalization distance matrix (binary matrix where 1 means the nodes are connected)
        normalization_distance_matrix = 1 * (
            shortest_dist_matrix[..., :, None] == shortest_dist_matrix[..., None, :]
        ).sum(-1)

        # Calculate distance matrix (inverse of shortest distance)
        distance_matrix = 1 / (1 + shortest_dist_matrix)

        # Add these matrices as node features
        graph.ndata["normalization_distance_matrix"] = normalization_distance_matrix
        graph.ndata["distance_matrix"] = distance_matrix.float()

    # Ensure the directory exists for saving the processed data
    os.makedirs(processed_data_dir, exist_ok=True)

    # Define the path to save the entire dataset
    processed_data_path = os.path.join(processed_data_dir, f"{data_name}_processed")

    # Optionally save labels along with graphs (in a dictionary)
    labels_dict = {"labels": torch.tensor(labels)}
    
    # Save the entire dataset (graphs + labels) using dgl.save_graphs
    dgl.save_graphs(processed_data_path + "_graphs.bin", list(graphs), labels=labels_dict)

    print(f"Entire processed dataset saved to {processed_data_path}_graphs.bin")


In [3]:
import os
import dgl
import random
import torch
from dgl.data import LegacyTUDataset
from preprocessdata import preProcessData


def getData(processed_data_dir="processed_data/mutagenicity_processed_graphs.bin"):
    if os.path.exists(processed_data_dir):
        graphs, label_dict = dgl.load_graphs(processed_data_dir)
    else:
        data = LegacyTUDataset("Mutagenicity")
        preProcessData(data)  # Your preprocessing function
        graphs, label_dict = dgl.load_graphs(processed_data_dir)

    # Get labels from the label_dict (assuming each graph corresponds to a label)
    labels = label_dict["labels"]

    # Randomly select 500 graphs from the dataset
    selected_indices = random.sample(range(len(graphs)), 500)
    selected_graphs = [graphs[i] for i in selected_indices]
    selected_labels = [labels[i] for i in selected_indices]

    # Split the selected 500 graphs into 300 for training, 100 for validation, and 100 for testing
    train_graphs, valid_graphs, test_graphs = (
        selected_graphs[:300],
        selected_graphs[300:400],
        selected_graphs[400:],
    )
    train_labels, valid_labels, test_labels = (
        selected_labels[:300],
        selected_labels[300:400],
        selected_labels[400:],
    )

    # Prepare the train, valid, and test data as pairs of graph and label
    train_data = list(zip(train_graphs, train_labels))
    valid_data = list(zip(valid_graphs, valid_labels))
    test_data = list(zip(test_graphs, test_labels))

    # Determine the number of features and classes
    num_feats = (
        selected_graphs[0].ndata["feat"].shape[1]
    )  # Assuming the first graph has the feature shape

    # Determine the number of classes based on the unique labels
    num_class = len(
        torch.unique(torch.tensor(labels)).tolist()
    )  # Use unique labels from the dataset

    return train_data, valid_data, test_data, num_feats, num_class


In [4]:
train_loader = getData()

  torch.unique(torch.tensor(labels)).tolist()


In [5]:
train_loader[0]

[(Graph(num_nodes=16, num_edges=32,
        ndata_schemes={'distance_matrix': Scheme(shape=(16,), dtype=torch.float32), 'normalization_distance_matrix': Scheme(shape=(16,), dtype=torch.int64), '_ID': Scheme(shape=(), dtype=torch.int64), 'feat': Scheme(shape=(14,), dtype=torch.float32)}
        edata_schemes={'_ID': Scheme(shape=(), dtype=torch.int64)}),
  tensor(0)),
 (Graph(num_nodes=23, num_edges=46,
        ndata_schemes={'distance_matrix': Scheme(shape=(23,), dtype=torch.float32), 'normalization_distance_matrix': Scheme(shape=(23,), dtype=torch.int64), '_ID': Scheme(shape=(), dtype=torch.int64), 'feat': Scheme(shape=(14,), dtype=torch.float32)}
        edata_schemes={'_ID': Scheme(shape=(), dtype=torch.int64)}),
  tensor(0)),
 (Graph(num_nodes=46, num_edges=90,
        ndata_schemes={'distance_matrix': Scheme(shape=(46,), dtype=torch.float32), 'normalization_distance_matrix': Scheme(shape=(46,), dtype=torch.int64), '_ID': Scheme(shape=(), dtype=torch.int64), 'feat': Scheme(shape=(1

In [6]:
for graph, label in train_loader[0]:
    print(graph, label)
    

Graph(num_nodes=16, num_edges=32,
      ndata_schemes={'distance_matrix': Scheme(shape=(16,), dtype=torch.float32), 'normalization_distance_matrix': Scheme(shape=(16,), dtype=torch.int64), '_ID': Scheme(shape=(), dtype=torch.int64), 'feat': Scheme(shape=(14,), dtype=torch.float32)}
      edata_schemes={'_ID': Scheme(shape=(), dtype=torch.int64)}) tensor(0)
Graph(num_nodes=23, num_edges=46,
      ndata_schemes={'distance_matrix': Scheme(shape=(23,), dtype=torch.float32), 'normalization_distance_matrix': Scheme(shape=(23,), dtype=torch.int64), '_ID': Scheme(shape=(), dtype=torch.int64), 'feat': Scheme(shape=(14,), dtype=torch.float32)}
      edata_schemes={'_ID': Scheme(shape=(), dtype=torch.int64)}) tensor(0)
Graph(num_nodes=46, num_edges=90,
      ndata_schemes={'distance_matrix': Scheme(shape=(46,), dtype=torch.float32), 'normalization_distance_matrix': Scheme(shape=(46,), dtype=torch.int64), '_ID': Scheme(shape=(), dtype=torch.int64), 'feat': Scheme(shape=(14,), dtype=torch.float32)}