<a href="https://colab.research.google.com/github/carloea2/project-273a/blob/main/toe.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [6]:
# Clone the project repository into ./toe. Re-running this cell when ./toe
# already exists makes git abort with a "destination path already exists" error.
!git clone https://github.com/seongjinyoon/toe.git

fatal: destination path 'toe' already exists and is not an empty directory.


In [7]:
!git pull

fatal: not a git repository (or any of the parent directories): .git


First, let's install PyTorch Geometric and import the necessary libraries.

In [4]:
# Install PyTorch Geometric, which provides the GATConv layer and the graph
# Data/DataLoader utilities used in this notebook.
!pip install torch_geometric

Collecting torch_geometric
  Downloading torch_geometric-2.7.0-py3-none-any.whl.metadata (63 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/63.7 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m63.7/63.7 kB[0m [31m5.8 MB/s[0m eta [36m0:00:00[0m
Downloading torch_geometric-2.7.0-py3-none-any.whl (1.3 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m67.7 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: torch_geometric
Successfully installed torch_geometric-2.7.0


In [5]:
import json
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch_geometric.nn import GATConv, global_mean_pool
from torch_geometric.data import Data, DataLoader
import numpy as np
from pathlib import Path
from tqdm import tqdm
import os

Now, let's define the `OperatorGAT` class.

In [None]:
class OperatorGAT(nn.Module):
    """
    Graph Attention Network for learning operator embeddings.

    Architecture:
    - Input: Initial operator embeddings from nomic-embed-code, projected to
      ``hidden_dim`` by a linear layer followed by ReLU and dropout
    - GAT layers: Learn attention weights between connected operators. Every
      layer except the last concatenates ``num_heads`` heads of width
      ``hidden_dim // num_heads``; the last layer uses a single head of width
      ``output_dim``
    - Each GAT layer is followed by LayerNorm; non-final layers additionally
      apply ELU, dropout and a residual connection
    - Output: Refined operator embeddings
    """

    def __init__(self, input_dim, hidden_dim=256, output_dim=128, num_heads=4, num_layers=3, dropout=0.3):
        """
        Args:
            input_dim: Dimension of input embeddings (from nomic-embed-code)
            hidden_dim: Hidden dimension for GAT layers
            output_dim: Final embedding dimension
            num_heads: Number of attention heads per GAT layer
            num_layers: Number of GAT layers
            dropout: Dropout rate

        Raises:
            ValueError: If more than one layer is requested and ``hidden_dim``
                is not divisible by ``num_heads``.
        """
        super().__init__()

        # Multi-head layers concatenate num_heads outputs of width
        # hidden_dim // num_heads; unless the division is exact the result is
        # narrower than hidden_dim and the next layer / LayerNorm would fail.
        if num_layers > 1 and hidden_dim % num_heads != 0:
            raise ValueError(
                f"hidden_dim ({hidden_dim}) must be divisible by num_heads ({num_heads})"
            )

        self.num_layers = num_layers
        self.dropout = dropout

        # Input projection
        self.input_proj = nn.Linear(input_dim, hidden_dim)

        # GAT layers
        self.gat_layers = nn.ModuleList()
        for i in range(num_layers):
            # The last-layer check comes first so that a single-layer model
            # still produces output_dim features (matching its LayerNorm).
            if i == num_layers - 1:
                # Last layer - output single head
                self.gat_layers.append(
                    GATConv(hidden_dim, output_dim, heads=1, concat=False, dropout=dropout)
                )
            else:
                # First and middle layers - concatenated multi-head attention
                self.gat_layers.append(
                    GATConv(hidden_dim, hidden_dim // num_heads, heads=num_heads, dropout=dropout)
                )

        # Layer normalization (hidden width for all layers but the last)
        self.layer_norms = nn.ModuleList([
            nn.LayerNorm(hidden_dim if i < num_layers - 1 else output_dim)
            for i in range(num_layers)
        ])

    def forward(self, x, edge_index, batch=None):
        """
        Forward pass through GAT.

        Args:
            x: Node features [num_nodes, input_dim]
            edge_index: Edge connectivity [2, num_edges]
            batch: Batch assignment for nodes (for batched graphs). Accepted
                for interface compatibility; not used by this method.

        Returns:
            Node embeddings [num_nodes, output_dim]
        """
        # Input projection
        x = self.input_proj(x)
        x = F.relu(x)
        x = F.dropout(x, p=self.dropout, training=self.training)

        last_index = self.num_layers - 1

        # GAT layers with residual connections
        for i, (gat, norm) in enumerate(zip(self.gat_layers, self.layer_norms)):
            x_residual = x

            # GAT layer followed by normalization
            x = norm(gat(x, edge_index))

            # The last layer is left linear: no activation, dropout or residual
            if i == last_index:
                continue

            x = F.elu(x)
            x = F.dropout(x, p=self.dropout, training=self.training)

            # Residual connection (only when the widths agree)
            if x_residual.shape == x.shape:
                x = x + x_residual

        return x

Next, we define the `WorkflowGraphDataset` class to handle loading the workflow data.

In [None]:
class WorkflowGraphDataset:
    """Dataset for loading workflows as PyTorch Geometric graphs.

    Each ``*.json`` file in the workflow directory is one workflow. Indexing
    the dataset parses the corresponding file and returns a ``Data`` graph
    whose nodes are the workflow's operators, or ``None`` when the workflow
    has no operators.
    """

    def __init__(self, workflow_dir, operator_embeddings_file):
        """
        Args:
            workflow_dir: Directory containing workflow JSON files
            operator_embeddings_file: JSON file with operator embeddings

        Raises:
            ValueError: If the embeddings file contains no operator types.
        """
        self.workflow_dir = Path(workflow_dir)
        # Sort the listing: glob order is unspecified, and a stable order keeps
        # dataset indices reproducible across runs and machines.
        self.workflow_files = sorted(self.workflow_dir.glob('*.json'))

        # Load operator embeddings
        print(f"Loading operator embeddings from {operator_embeddings_file}...")
        with open(operator_embeddings_file, 'r', encoding='utf-8') as f:
            embeddings_data = json.load(f)

        if not embeddings_data:
            raise ValueError(
                f"No operator embeddings found in {operator_embeddings_file}"
            )

        # Create operator type to embedding mapping
        self.operator_type_to_embedding = {
            op_type: np.array(data['embedding'], dtype=np.float32)
            for op_type, data in embeddings_data.items()
        }

        # Check how many embeddings and embedding dimension
        self.embedding_dim = len(next(iter(self.operator_type_to_embedding.values())))
        print(f"Loaded {len(self.operator_type_to_embedding)} operator types")
        print(f"Embedding dimension: {self.embedding_dim}")

        # Create a mapping for unknown operators (use mean embedding)
        all_embeddings = np.array(list(self.operator_type_to_embedding.values()))
        self.unknown_embedding = np.mean(all_embeddings, axis=0)

    def load_workflow_graph(self, workflow_file):
        """
        Load a single workflow as a PyTorch Geometric graph.

        Args:
            workflow_file: Path of the workflow JSON file to load

        Returns:
            Data object with node features and edge indices, or None when the
            workflow has no operators. The returned object also carries
            ``operator_types`` (one type name per node) and ``workflow_file``
            (the source path as a string).
        """
        with open(workflow_file, 'r', encoding='utf-8') as f:
            workflow = json.load(f)

        operators = workflow.get('operators', [])
        links = workflow.get('links', [])

        if not operators:
            return None

        # Create operator ID to index mapping
        op_id_to_idx = {op['operatorID']: idx for idx, op in enumerate(operators)}

        # Create node features (operator embeddings); operator types without a
        # known embedding fall back to the mean embedding.
        operator_types = [op.get('operatorType', 'Unknown') for op in operators]
        node_features = np.array(
            [
                self.operator_type_to_embedding.get(op_type, self.unknown_embedding)
                for op_type in operator_types
            ],
            dtype=np.float32,
        )

        # Create edge index from links, skipping links that reference
        # operators missing from this workflow.
        edge_list = []
        for link in links or []:
            source_id = link['source']['operatorID']
            target_id = link['target']['operatorID']

            if source_id in op_id_to_idx and target_id in op_id_to_idx:
                edge_list.append([op_id_to_idx[source_id], op_id_to_idx[target_id]])

        if not edge_list:
            # No edges - create self-loops for isolated nodes
            edge_list = [[i, i] for i in range(len(operators))]

        # Convert to PyTorch tensors; edge_index is laid out as [2, num_edges]
        x = torch.tensor(node_features, dtype=torch.float32)
        edge_index = torch.tensor(edge_list, dtype=torch.long).t().contiguous()

        # Create Data object
        data = Data(x=x, edge_index=edge_index)
        data.operator_types = operator_types
        data.workflow_file = str(workflow_file)

        return data

    def __len__(self):
        """Return the number of workflow files found in the directory."""
        return len(self.workflow_files)

    def __getitem__(self, idx):
        """Load and return the graph for the ``idx``-th workflow file."""
        workflow_file = self.workflow_files[idx]
        return self.load_workflow_graph(workflow_file)

Here are the functions for the graph reconstruction loss, the per-epoch training loop, and extracting operator embeddings.

In [None]:
def create_graph_reconstruction_loss(embeddings, edge_index, negative_samples=5):
    """
    Graph reconstruction loss: predict edges from node embeddings.
    Encourages connected operators to have similar embeddings.

    Positive pairs are the edges in ``edge_index``; negative pairs are node
    pairs drawn uniformly at random (they are not checked against the real
    edges). Pairs are scored with a dot product.

    Args:
        embeddings: Node embeddings [num_nodes, embedding_dim]
        edge_index: Edge connectivity [2, num_edges]
        negative_samples: Number of negative samples per positive edge

    Returns:
        Scalar loss: mean ``-logsigmoid(score)`` over positive pairs plus mean
        ``-logsigmoid(-score)`` over negative pairs. With no edges the result
        is a zero that is still attached to ``embeddings`` in the autograd
        graph; with ``negative_samples <= 0`` only the positive term is
        returned.
    """
    num_nodes = embeddings.size(0)
    num_edges = edge_index.size(1)

    if num_edges == 0:
        # Averaging over zero edges would produce NaN. Return a zero that still
        # depends on `embeddings` so callers can call .backward() on it.
        return embeddings.sum() * 0.0

    # Positive edges
    src_embed = embeddings[edge_index[0]]
    dst_embed = embeddings[edge_index[1]]

    # Compute similarity (dot product)
    pos_scores = torch.sum(src_embed * dst_embed, dim=1)
    pos_loss = -F.logsigmoid(pos_scores).mean()

    if negative_samples <= 0:
        # No negatives requested: the mean over an empty tensor would be NaN
        return pos_loss

    # Sample random negative edges
    num_negatives = num_edges * negative_samples
    neg_src = torch.randint(0, num_nodes, (num_negatives,), device=embeddings.device)
    neg_dst = torch.randint(0, num_nodes, (num_negatives,), device=embeddings.device)

    neg_scores = torch.sum(embeddings[neg_src] * embeddings[neg_dst], dim=1)
    neg_loss = -F.logsigmoid(-neg_scores).mean()

    return pos_loss + neg_loss


def train_epoch(model, dataloader, optimizer, device):
    """
    Run one optimisation pass over ``dataloader``.

    Each batch is moved to ``device``, embedded by ``model``, and scored with
    the graph reconstruction loss; ``None`` batches are skipped.

    Returns:
        Mean loss per processed batch (0 when no batch was processed).
    """
    model.train()

    running_loss = 0
    batches_seen = 0

    for graph_batch in tqdm(dataloader, desc="Training"):
        if graph_batch is None:
            continue

        graph_batch = graph_batch.to(device)
        optimizer.zero_grad()

        # Forward: node embeddings for every graph in the batch
        node_embeddings = model(
            graph_batch.x, graph_batch.edge_index, graph_batch.batch
        )

        # Link-prediction objective on the batch's edges
        batch_loss = create_graph_reconstruction_loss(
            node_embeddings, graph_batch.edge_index
        )

        # Backward and parameter update
        batch_loss.backward()
        optimizer.step()

        running_loss += batch_loss.item()
        batches_seen += 1

    return running_loss / max(batches_seen, 1)


def extract_operator_embeddings(model, dataset, device):
    """
    Build one embedding per operator type from the trained model.

    Every workflow in ``dataset`` is passed through ``model`` in eval mode with
    gradients disabled; the resulting node embeddings are grouped by operator
    type and averaged.

    Returns:
        Dict mapping operator type to a dict with keys ``'embedding'`` (the
        averaged vector as a list), ``'embedding_dim'`` and
        ``'num_occurrences'``.
    """
    model.eval()

    # operator type -> list of node embeddings seen for that type
    vectors_by_type = {}

    with torch.no_grad():
        for index in tqdm(range(len(dataset)), desc="Extracting embeddings"):
            graph = dataset[index]
            if graph is None:
                continue

            graph = graph.to(device)

            # Node embeddings for this workflow, as a NumPy array on the CPU
            node_vectors = model(graph.x, graph.edge_index).cpu().numpy()

            # Group by operator type
            for type_name, vector in zip(graph.operator_types, node_vectors):
                vectors_by_type.setdefault(type_name, []).append(vector)

    # Average the collected vectors for each operator type
    return {
        type_name: {
            'embedding': np.mean(vectors, axis=0).tolist(),
            'embedding_dim': len(vectors[0]),
            'num_occurrences': len(vectors),
        }
        for type_name, vectors in vectors_by_type.items()
    }

Finally, here is the main execution block to set up and train the model.

In [None]:
def main():
    """
    Main training function.

    Loads the workflow graphs and initial operator embeddings, trains an
    OperatorGAT with the graph reconstruction loss for a fixed number of
    epochs, then writes the per-operator-type embeddings to a JSON file.
    """
    # torch_geometric.data.DataLoader is a deprecated alias; the loader module
    # is the supported location. Imported locally so this function does not
    # depend on which DataLoader the surrounding file imported.
    from torch_geometric.loader import DataLoader

    # Configuration
    WORKFLOW_DIR = 'workflows_selected'
    OPERATOR_EMBEDDINGS_FILE = 'operator_embeddings2.json'  # Initial embeddings from nomic-embed-code
    OUTPUT_FILE = 'operator_embeddings_gat.json'  # Final embeddings after GAT

    # Hyperparameters
    HIDDEN_DIM = 256
    OUTPUT_DIM = 128
    NUM_HEADS = 4
    NUM_LAYERS = 3
    DROPOUT = 0.3
    BATCH_SIZE = 32
    NUM_EPOCHS = 50
    LEARNING_RATE = 0.001

    # Device
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    print(f"Using device: {device}")

    # Load dataset
    print("\n" + "="*80)
    print("Loading workflow dataset...")
    print("="*80)
    dataset = WorkflowGraphDataset(WORKFLOW_DIR, OPERATOR_EMBEDDINGS_FILE)

    # Filter out None graphs. Each index is loaded exactly once: indexing the
    # dataset parses the workflow file, so it must not be repeated per item.
    loaded_graphs = (dataset[i] for i in range(len(dataset)))
    valid_graphs = [graph for graph in loaded_graphs if graph is not None]
    print(f"\nLoaded {len(valid_graphs)} valid workflow graphs")

    # Create dataloader
    dataloader = DataLoader(valid_graphs, batch_size=BATCH_SIZE, shuffle=True)

    # Initialize model and optimizer
    print("\n" + "="*80)
    print("Initializing model and optimizer...")
    print("="*80)
    model = OperatorGAT(
        input_dim=dataset.embedding_dim,
        hidden_dim=HIDDEN_DIM,
        output_dim=OUTPUT_DIM,
        num_heads=NUM_HEADS,
        num_layers=NUM_LAYERS,
        dropout=DROPOUT
    ).to(device)
    optimizer = torch.optim.Adam(model.parameters(), lr=LEARNING_RATE)

    # Training loop
    print("\n" + "="*80)
    print(f"Starting training for {NUM_EPOCHS} epochs...")
    print("="*80)
    for epoch in range(1, NUM_EPOCHS + 1):
        loss = train_epoch(model, dataloader, optimizer, device)
        print(f"Epoch {epoch}/{NUM_EPOCHS}, Loss: {loss:.4f}")

    # Extract and save final embeddings
    print("\n" + "="*80)
    print("Extracting and saving final operator embeddings...")
    print("="*80)
    final_embeddings = extract_operator_embeddings(model, dataset, device)

    with open(OUTPUT_FILE, 'w') as f:
        json.dump(final_embeddings, f, indent=4)

    print(f"\nFinal operator embeddings saved to {OUTPUT_FILE}")


if __name__ == "__main__":
    main()

In [2]:
# Load the training script's source text from disk
script_path = '/content/toe/train_gat_embeddings.py'
with open(script_path, 'r') as f:
    file_content = f.read()

# Echo the source so it can be inspected in the notebook output
print(file_content)

"""
Train Graph Attention Network (GAT) to learn operator embeddings that combine:
1. Semantic information from code embeddings (nomic-embed-code)
2. Structural information from workflow graphs

This script:
- Loads operator code embeddings as initial node features
- Loads workflow graphs from JSON files
- Trains a GAT to learn better embeddings by considering how operators are connected
- Outputs final operator embeddings that capture both semantics and usage patterns
"""

import json
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch_geometric.nn import GATConv, global_mean_pool
from torch_geometric.data import Data, DataLoader
import numpy as np
from pathlib import Path
from tqdm import tqdm
import os


class OperatorGAT(nn.Module):
    """
    Graph Attention Network for learning operator embeddings.

    Architecture:
    - Input: Initial operator embeddings from nomic-embed-code
    - GAT layers: Learn attention weights between connected operators
    -