<a href="https://colab.research.google.com/github/bodadineshreddy/indictrans2/blob/main/GNNA2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install --upgrade torch-geometric torch-sparse torch-scatter -f https://data.pyg.org/whl/torch-2.0.0+cu118.html
import torch
import os.path as osp
import torch.nn as nn
import torch.optim as optim
from torch_geometric.datasets import MoleculeNet
from torch_geometric.loader import DataLoader
from sklearn.metrics import roc_auc_score
import random
from collections import Counter
import os
from torch_geometric.data import download_url, extract_zip

# Download the dataset (if not already present).
# The earlier '.../csv_mol_download/ogbg_molhiv.zip' link answered with
# HTTP 404 (see the recorded cell output). OGB serves this dataset as
# 'hiv.zip', and the archive unpacks into a top-level 'hiv/' directory, so
# the directory is renamed afterwards to keep the 'ogbg_molhiv' existence
# check above meaningful on the next run.
if not os.path.exists('ogbg_molhiv'):
    download_url('https://snap.stanford.edu/ogb/data/graphproppred/csv_mol_download/hiv.zip', './')
    extract_zip('./hiv.zip', './')
    os.remove('./hiv.zip')
    os.rename('./hiv', './ogbg_molhiv')
else:
    print("Dataset already downloaded and extracted.")

# Register the key 'ogbg_molhiv' in MoleculeNet.names unless it is already
# present; an existing entry is left untouched.
# NOTE(review): the value stored here is a plain string. Confirm that this is
# the shape MoleculeNet expects for entries of `names`.
MoleculeNet.names.setdefault('ogbg_molhiv', 'ogbg_molhiv')

# Manually define the 'ogbg_molhiv' class
class ogbg_molhiv(MoleculeNet):
    """``MoleculeNet`` subclass with the dataset name fixed to ``'ogbg_molhiv'``.

    The constructor forwards every argument to ``MoleculeNet.__init__`` and
    only pins the ``name`` argument. ``url`` is overridden as a read-only
    property.

    NOTE(review): how the base class consumes ``url`` and its ``names`` table
    during download/processing is not visible in this file -- confirm that
    this override and the registered name are what the base class expects.
    """

    def __init__(self, root, transform=None, pre_transform=None, pre_filter=None):
        """Initialise the base dataset under ``root`` with name ``'ogbg_molhiv'``.

        ``transform``, ``pre_transform`` and ``pre_filter`` are passed through
        to the base class unchanged.
        """
        super().__init__(root, 'ogbg_molhiv', transform, pre_transform, pre_filter)

    @property
    def url(self):
        """Return ``MoleculeNet.url`` joined with ``ogbg_molhiv/raw/data.zip``."""
        # Exposed as a property so that attribute access yields a string
        # rather than a bound method.
        # The base value is read from the parent class attribute, not from
        # ``self``, which avoids recursing into this property.
        # NOTE(review): osp.join uses the platform path separator, so the
        # result is only URL-shaped on POSIX systems -- confirm this is
        # acceptable.
        dataset_dir = MoleculeNet.url
        return osp.join(dataset_dir, 'ogbg_molhiv', 'raw', 'data.zip')


# Module-level dataset instance rooted at the current directory, created as a
# side effect of running this cell. load_ogbg_molhiv() constructs its own
# instance, so nothing in the code visible here reads this variable.
dataset = ogbg_molhiv(root=".")

# Data Loaders
def load_ogbg_molhiv(batch_size):
    """Build the train/valid/test data loaders for the ogbg-molhiv dataset.

    The dataset is instantiated with ``root="."`` and partitioned with the
    index dictionary returned by ``get_idx_split()`` (keys ``"train"``,
    ``"valid"`` and ``"test"``).

    Args:
        batch_size: Batch size used for all three loaders.

    Returns:
        A ``(train_loader, valid_loader, test_loader)`` tuple. Only the
        training loader shuffles its data.
    """
    # NOTE(review): get_idx_split() is not defined in this file -- confirm
    # that the MoleculeNet base class actually provides it.
    molhiv = ogbg_molhiv(root=".")
    partition = molhiv.get_idx_split()

    return tuple(
        DataLoader(
            molhiv[partition[part]],
            batch_size=batch_size,
            shuffle=(part == "train"),
        )
        for part in ("train", "valid", "test")
    )



Looking in links: https://data.pyg.org/whl/torch-2.0.0+cu118.html
Collecting torch-geometric
  Downloading torch_geometric-2.6.1-py3-none-any.whl.metadata (63 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m63.1/63.1 kB[0m [31m2.8 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting torch-sparse
  Downloading https://data.pyg.org/whl/torch-2.0.0%2Bcu118/torch_sparse-0.6.18%2Bpt20cu118-cp311-cp311-linux_x86_64.whl (4.9 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m4.9/4.9 MB[0m [31m18.0 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting torch-scatter
  Downloading https://data.pyg.org/whl/torch-2.0.0%2Bcu118/torch_scatter-2.1.2%2Bpt20cu118-cp311-cp311-linux_x86_64.whl (10.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m10.2/10.2 MB[0m [31m25.1 MB/s[0m eta [36m0:00:00[0m
Downloading torch_geometric-2.6.1-py3-none-any.whl (1.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.1/1.1 MB[0m [31m21.6 MB/s[0m e

Downloading https://snap.stanford.edu/ogb/data/graphproppred/csv_mol_download/ogbg_molhiv.zip


HTTPError: HTTP Error 404: Not Found

In [None]:
# 2. Anonymous Walk Embedding Generation
def _build_adjacency(edge_index):
    """Map each source node id to the list of its targets, in edge order."""
    sources, targets = edge_index.tolist()
    adjacency = {}
    for src, dst in zip(sources, targets):
        adjacency.setdefault(src, []).append(dst)
    return adjacency


def _random_walk(adjacency, num_nodes, walk_length):
    """Sample one walk of at most ``walk_length`` nodes from a random start."""
    walk = [random.randint(0, num_nodes - 1)]
    for _ in range(walk_length - 1):
        neighbors = adjacency.get(walk[-1])
        # A node without outgoing edges leaves the walk where it is, so the
        # resulting walk can be shorter than ``walk_length``.
        if neighbors:
            walk.append(random.choice(neighbors))
    return walk


def _anonymize(walk):
    """Replace every node by the position of its first occurrence in the walk."""
    first_seen = {}
    return tuple(first_seen.setdefault(node, pos) for pos, node in enumerate(walk))


def generate_anonymous_walk_embeddings(graphs, walk_length, num_walks, vocabulary=None):
    """Generate anonymous-walk count embeddings for a collection of graphs.

    For every graph, ``num_walks`` random walks are sampled. Each walk is
    anonymised by replacing every node with the index of its first occurrence
    in that walk (e.g. nodes ``[7, 3, 7, 9]`` become ``(0, 1, 0, 3)``), and
    the embedding of a graph is the vector of counts of each anonymous walk.

    Args:
        graphs: An iterable of graph objects exposing ``edge_index`` (a
            ``(2, num_edges)`` integer tensor) and ``num_nodes``. An object
            with a ``to_data_list()`` method (such as a PyTorch Geometric
            ``Batch``) is unbatched first, because iterating it directly does
            not yield graphs.
        walk_length: Maximum number of nodes in each sampled walk.
        num_walks: Number of walks sampled per graph.
        vocabulary: Optional sequence of anonymous walks (tuples of ints)
            that fixes the embedding columns and their order. Walks that are
            sampled but not listed are ignored. When ``None`` (default), the
            columns are the walks observed in this call, in sorted order, so
            the width can differ from one call to the next.

    Returns:
        A tuple ``(embeddings, walks)`` where ``embeddings`` is a float32
        tensor of shape ``(num_graphs, len(walks))`` and ``walks`` is the list
        of anonymous walks labelling the columns. Graphs without nodes get an
        all-zero row.
    """
    if hasattr(graphs, "to_data_list"):
        graphs = graphs.to_data_list()

    per_graph_counts = []
    for graph in graphs:
        counts = Counter()
        num_nodes = graph.num_nodes
        # random.randint(0, -1) would raise for a graph with no nodes.
        if num_nodes > 0:
            # Built once per graph instead of rescanning every edge at
            # every step of every walk.
            adjacency = _build_adjacency(graph.edge_index)
            for _ in range(num_walks):
                counts[_anonymize(_random_walk(adjacency, num_nodes, walk_length))] += 1
        per_graph_counts.append(counts)

    if vocabulary is None:
        # Sorting gives a column order that does not depend on set iteration.
        columns = sorted(set().union(*per_graph_counts))
    else:
        columns = [tuple(walk) for walk in vocabulary]

    rows = [[counts.get(walk, 0) for walk in columns] for counts in per_graph_counts]
    # The explicit reshape keeps the result 2-D even with no graphs or columns.
    embeddings = torch.tensor(rows, dtype=torch.float32).reshape(len(rows), len(columns))
    return embeddings, columns

In [None]:
# 3. Neural Network Model Definition
class SimpleClassifier(nn.Module):
    """Two-layer perceptron: linear -> ReLU -> linear."""

    def __init__(self, input_dim, hidden_dim, num_classes):
        """Create the layers.

        Args:
            input_dim: Number of features in each input vector.
            hidden_dim: Width of the hidden layer.
            num_classes: Number of output units (raw scores, no activation).
        """
        super().__init__()
        self.fc1 = nn.Linear(input_dim, hidden_dim)
        self.relu = nn.ReLU()
        self.fc2 = nn.Linear(hidden_dim, num_classes)

    def forward(self, x):
        """Return the un-normalised output scores for the input batch ``x``."""
        hidden = self.relu(self.fc1(x))
        return self.fc2(hidden)

In [None]:
# 4. Training
def train(model, train_loader, optimizer, loss_fn, device, walk_length=5, num_walks=100):
    """Train the model for one epoch and return the mean batch loss.

    Args:
        model: Module mapping an embedding matrix to output scores.
        train_loader: Iterable of batches; each batch exposes ``y`` and is
            either a list of graphs or an object with ``to_data_list()``.
        optimizer: Optimizer holding the model parameters.
        loss_fn: Callable ``loss_fn(outputs, labels)`` returning a scalar.
        device: Device on which the forward/backward pass runs.
        walk_length: Walk length used for the anonymous-walk embeddings.
        num_walks: Number of walks sampled per graph.

    Returns:
        The average loss over the batches seen, or ``0.0`` when the loader
        yields no batches.
    """
    model.train()
    total_loss = 0.0
    num_batches = 0
    for batch in train_loader:
        labels = batch.y.float().to(device)

        # Iterating a PyG Batch yields (key, value) attribute pairs rather
        # than graphs, so split it back into individual graphs first. The
        # walks are sampled in Python, so the graphs are left on the CPU.
        graphs = batch.to_data_list() if hasattr(batch, "to_data_list") else batch

        # NOTE(review): the embedding width is derived from the walks seen in
        # this batch, so it can differ between batches -- confirm that it
        # matches the model's input size.
        embeddings, _ = generate_anonymous_walk_embeddings(
            graphs, walk_length=walk_length, num_walks=num_walks
        )
        embeddings = embeddings.to(device)

        optimizer.zero_grad()
        outputs = model(embeddings)
        # Align label shape with the outputs (e.g. (B,) vs. (B, 1)); this is
        # a no-op when the shapes already agree.
        loss = loss_fn(outputs, labels.reshape(outputs.shape))
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
        num_batches += 1

    # Guard against an empty loader instead of dividing by zero.
    return total_loss / num_batches if num_batches else 0.0

In [None]:
# 5. Evaluation
def evaluate(model, data_loader, device, walk_length=5, num_walks=100):
    """Evaluate the model on a data loader and return the ROC-AUC score.

    Args:
        model: Module mapping an embedding matrix to output logits.
        data_loader: Iterable of batches; each batch exposes ``y`` and is
            either a list of graphs or an object with ``to_data_list()``.
        device: Device on which the forward pass runs.
        walk_length: Walk length used for the anonymous-walk embeddings.
        num_walks: Number of walks sampled per graph.

    Returns:
        The value of ``roc_auc_score`` computed over all batches, using the
        sigmoid of the model outputs as scores.
    """
    model.eval()
    y_true = []
    y_pred = []
    with torch.no_grad():
        for batch in data_loader:
            # Labels are only collected for the metric, so they stay on the CPU.
            labels = batch.y.float()

            # Iterating a PyG Batch yields (key, value) attribute pairs
            # rather than graphs, so split it into individual graphs first.
            graphs = batch.to_data_list() if hasattr(batch, "to_data_list") else batch

            # NOTE(review): the embedding width is derived from the walks
            # seen in this batch, so it can differ between batches -- confirm
            # that it matches the model's input size.
            embeddings, _ = generate_anonymous_walk_embeddings(
                graphs, walk_length=walk_length, num_walks=num_walks
            )
            embeddings = embeddings.to(device)

            outputs = torch.sigmoid(model(embeddings))  # Assuming binary classification
            y_true.extend(labels.cpu().numpy())
            y_pred.extend(outputs.cpu().numpy())
    return roc_auc_score(y_true, y_pred)

In [None]:
# Main Execution
# Script entry point: builds the loaders, sizes and trains the classifier on
# anonymous-walk embeddings, and reports validation and test ROC-AUC.
if __name__ == "__main__":
    # Hyperparameters
    batch_size = 32
    hidden_dim = 64
    learning_rate = 0.001
    num_epochs = 10
    # NOTE(review): walk_length and num_walks are only passed to the example
    # embedding call below; the train() and evaluate() calls in this block are
    # not given them.
    walk_length = 5  # Length of random walks
    num_walks = 100  # Number of walks per graph

    # Device (GPU if available)
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    # Data Loaders
    train_loader, valid_loader, test_loader = load_ogbg_molhiv(batch_size)

    # Example: Generate embeddings for a small batch to get input dimension
    # NOTE(review): input_dim is taken from the column count of this single
    # batch. The embedding function derives its columns from the walks
    # observed in each call, so other batches may produce a different width --
    # confirm that this matches what the model receives during training.
    example_batch = next(iter(train_loader))
    example_embeddings, all_walks = generate_anonymous_walk_embeddings(
        example_batch, walk_length, num_walks
    )
    input_dim = example_embeddings.shape[1]  # Dimension of the embeddings

    # Model Initialization
    model = SimpleClassifier(input_dim, hidden_dim, 1).to(device)  # Output dim = 1 for binary classification

    # Loss Function and Optimizer
    # BCEWithLogitsLoss takes raw model outputs; evaluate() applies the
    # sigmoid itself before scoring.
    loss_fn = nn.BCEWithLogitsLoss()  # Binary Cross-Entropy Loss
    optimizer = optim.Adam(model.parameters(), lr=learning_rate)

    # Training Loop: one training pass and one validation pass per epoch
    for epoch in range(num_epochs):
        train_loss = train(model, train_loader, optimizer, loss_fn, device)
        valid_auc = evaluate(model, valid_loader, device)
        print(
            f"Epoch {epoch + 1}/{num_epochs}, Train Loss: {train_loss:.4f}, Valid AUC: {valid_auc:.4f}"
        )

    # Evaluation on Test Set (after the final epoch)
    test_auc = evaluate(model, test_loader, device)
    print(f"Test AUC: {test_auc:.4f}")