This is a clean generator for the generating graphs from the larger pdbs


First we do the imports and paths

In [15]:

import os
import pickle
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import networkx as nx
from graphein.protein.config import ProteinGraphConfig, DSSPConfig
from graphein.protein.graphs import construct_graph
from graphein.protein.features.nodes.dssp import add_dssp_df
from graphein.protein.utils import download_pdb
import graphein.protein as gp
from functools import partial
from graphein.ml.conversion import GraphFormatConvertor
from graphein.protein.edges.distance import (add_peptide_bonds,
                                             add_hydrogen_bond_interactions,
                                             add_disulfide_interactions,
                                             add_ionic_interactions,
                                             add_aromatic_interactions,
                                             add_aromatic_sulphur_interactions,
                                             add_cation_pi_interactions
                                             )



# Set DSSP path environment variable this is mac specific
os.environ["DSSP_PATH"] = "/opt/homebrew/bin/mkdssp"
print(f"DSSP_PATH set to: {os.environ['DSSP_PATH']}")

DSSP_PATH set to: /opt/homebrew/bin/mkdssp


this is the config for the graphs properties

In [None]:
# Use this edge function set for complete biochemical interactions
all_edge_func = {"edge_construction_functions": [

    add_peptide_bonds,
    add_aromatic_interactions,
    add_hydrogen_bond_interactions,
    add_disulfide_interactions,
    add_ionic_interactions,
    add_aromatic_sulphur_interactions,
    add_cation_pi_interactions,
    gp.add_hydrophobic_interactions,
    gp.add_salt_bridges]}
# not sure the salt bridges always work


# Use these metadata configurations for the properties you need
complete_config = {
    #"graph_metadata_functions": [gp.rsa, gp.secondary_structure], - these come from the dssp
    "node_metadata_functions": [gp.amino_acid_one_hot,
                                gp.meiler_embedding,
                                partial(gp.expasy_protein_scale, add_separate=True)]
}

# Combined configuration
full_config = gp.ProteinGraphConfig(**{**all_edge_func, **complete_config})


In [8]:
def generate_graph_no_dssp():
    graph = construct_graph(path="2X89.pdb", config=full_config)
    return graph


this is just the dssp stuff being spliced on to the graphein generated graph

In [9]:
def add_all_dssp_features_to_nodes(graph):
    """
    Extract all available DSSP features from the dataframe and add them to nodes.
    """
    if "dssp_df" not in graph.graph:
        print("No DSSP dataframe found in graph")
        return graph

    # Get the DSSP dataframe
    dssp_df = graph.graph["dssp_df"]

    # Print available columns in DSSP dataframe
    print(f"Available DSSP features: {list(dssp_df.columns)}")

    # Map of common DSSP column names and their descriptions
    dssp_features = {
        'ss': 'Secondary structure (ss_value)',
        'aa': 'Amino acid (dssp_aa)',
        'acc': 'Absolute solvent accessibility (acc)',
        'phi': 'Phi angle (phi)',
        'psi': 'Psi angle (psi)',
        'dssp_index': 'DSSP residue index (dssp_index)',
        'NH_O_1_relidx': 'First NH-O hydrogen bond relative index (nh_o1_relidx)',
        'NH_O_1_energy': 'First NH-O hydrogen bond energy (nh_o1_energy)',
        'O_NH_1_relidx': 'First O-NH hydrogen bond relative index (o_nh1_relidx)',
        'O_NH_1_energy': 'First O-NH hydrogen bond energy (o_nh1_energy)',
        'NH_O_2_relidx': 'Second NH-O hydrogen bond relative index (nh_o2_relidx)',
        'NH_O_2_energy': 'Second NH-O hydrogen bond energy (nh_o2_energy)',
        'O_NH_2_relidx': 'Second O-NH hydrogen bond relative index (o_nh2_relidx)',
        'O_NH_2_energy': 'Second O-NH hydrogen bond energy (o_nh2_energy)',
    }

    # Variation in column naming across different versions
    alt_column_names = {
        'ss': ['ss', 'SS', 'sec_struc'],
        'aa': ['aa', 'AA', 'amino_acid'],
        'acc': ['acc', 'ACC', 'accessibility'],
        'phi': ['phi', 'PHI'],
        'psi': ['psi', 'PSI'],
        'dssp_index': ['dssp_index', 'id'],
        'NH_O_1_relidx': ['NH_O_1_relidx', 'NH-O_1_relidx', 'NH_O_1_ridx'],
        'NH_O_1_energy': ['NH_O_1_energy', 'NH-O_1_energy'],
        'O_NH_1_relidx': ['O_NH_1_relidx', 'O-NH_1_relidx', 'O_NH_1_ridx'],
        'O_NH_1_energy': ['O_NH_1_energy', 'O-NH_1_energy'],
        'NH_O_2_relidx': ['NH_O_2_relidx', 'NH-O_2_relidx', 'NH_O_2_ridx'],
        'NH_O_2_energy': ['NH_O_2_energy', 'NH-O_2_energy'],
        'O_NH_2_relidx': ['O_NH_2_relidx', 'O-NH_2_relidx', 'O_NH_2_ridx'],
        'O_NH_2_energy': ['O_NH_2_energy', 'O-NH_2_energy'],
    }

    # Find the actual column names in the dataframe
    actual_columns = {}
    for feature, alternatives in alt_column_names.items():
        for alt in alternatives:
            if alt in dssp_df.columns:
                actual_columns[feature] = alt
                break

    print(f"Found {len(actual_columns)} DSSP features in the dataframe")

    # Track how many nodes were updated
    updated_nodes = 0
    features_added = set()

    # Different versions of Graphein might have different column names for chain & residue
    # Try to identify the correct column names
    chain_col = next((c for c in dssp_df.columns if 'chain' in c.lower()), None)
    res_num_col = next((c for c in dssp_df.columns if 'res' in c.lower() and 'num' in c.lower()), None)

    if not chain_col or not res_num_col:
        print(f"Could not identify chain and residue number columns in DSSP dataframe")
        print(f"Available columns: {dssp_df.columns}")
        return graph

    print(f"Using '{chain_col}' for chain ID and '{res_num_col}' for residue number")

    # Add features to each node
    for node, data in graph.nodes(data=True):
        # Extract chain and residue info from node ID
        parts = str(node).split(':')
        if len(parts) < 3:
            continue

        chain = parts[0]
        residue_num = parts[2]

        try:
            # Find matching row in DSSP dataframe
            mask = (dssp_df[chain_col] == chain) & (dssp_df[res_num_col] == int(residue_num))
            matching_rows = dssp_df[mask]

            if matching_rows.empty:
                continue

            # Add all available features to the node
            for feature, col_name in actual_columns.items():
                # Get the node attribute name from feature descriptions
                node_attr = dssp_features[feature].split('(')[1].split(')')[0] if '(' in dssp_features[feature] else feature

                # Add the feature to the node
                if col_name in matching_rows.columns:
                    data[node_attr] = matching_rows[col_name].values[0]
                    features_added.add(node_attr)

            updated_nodes += 1

        except (ValueError, KeyError) as e:
            # Skip this node if there are issues
            continue

    print(f"Updated {updated_nodes} out of {len(graph.nodes)} nodes")
    print(f"Added the following features to nodes: {sorted(list(features_added))}")

    return graph


In [10]:
def debug_graph(graph):
    ''' Debugging function to print basic graph info '''
    # Check graph type
    print(f"Graph type: {type(graph)}")

    # Print basic node info
    print(f"\nNumber of nodes: {len(graph.nodes())}")
    if len(graph.nodes()) > 0:
        sample_node = list(graph.nodes())[0]
        print(f"Sample node: {sample_node}")
        print(f"Sample node attributes: {graph.nodes[sample_node]}")

    # Print basic edge info
    print(f"\nNumber of edges: {len(graph.edges())}")
    if len(graph.edges()) > 0:
        sample_edge = list(graph.edges())[0]
        print(f"Sample edge: {sample_edge}")
        print(f"Sample edge attributes: {graph.edges[sample_edge]}")



In [11]:
# Run the test and debug
result = generate_graph_no_dssp()
debug_graph(result)

Output()

Graph type: <class 'networkx.classes.graph.Graph'>

Number of nodes: 744
Sample node: A:GLN:1
Sample node attributes: {'chain_id': 'A', 'residue_name': 'GLN', 'residue_number': 1, 'atom_type': 'CA', 'element_symbol': 'C', 'coords': array([-71.294,  32.652,  -7.308], dtype=float32), 'b_factor': 70.94999694824219, 'amino_acid_one_hot': array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0]), 'meiler': dim_1    1.56
dim_2    0.18
dim_3    3.95
dim_4   -0.22
dim_5    5.65
dim_6    0.35
dim_7    0.25
Name: GLN, dtype: float64, 'pka_cooh_alpha': 2.17, 'pka_nh3': 9.13, 'pka_rgroup': 7.0, 'isoelectric_points': 5.65, 'molecularweight': 146.0, 'numbercodons': 2.0, 'bulkiness': 14.45, 'polarityzimmerman': 3.53, 'polaritygrantham': 10.5, 'refractivity': 17.56, 'recognitionfactors': 87.0, 'hphob_eisenberg': -0.85, 'hphob_sweet': -0.91, 'hphob_woods': 0.2, 'hphob_doolittle': -3.5, 'hphob_manavalan': 11.76, 'hphob_leo': -0.71, 'hphob_black': 0.251, 'hphob_breese': 0.97, 'hphob_fauchere': 

In [12]:
graph_test = generate_graph_no_dssp()

config = ProteinGraphConfig()
config.dssp_config = DSSPConfig()

# Add DSSP dataframe
print("Adding DSSP dataframe...")
graph_test = add_dssp_df(graph_test, dssp_config=config.dssp_config)

# Add secondary structure to nodes - this is from the function above...
graph_test = add_all_dssp_features_to_nodes(graph_test)

debug_graph(graph_test)


Output()

Adding DSSP dataframe...
Available DSSP features: ['chain', 'resnum', 'icode', 'aa', 'ss', 'asa', 'phi', 'psi', 'dssp_index', 'NH_O_1_relidx', 'NH_O_1_energy', 'O_NH_1_relidx', 'O_NH_1_energy', 'NH_O_2_relidx', 'NH_O_2_energy', 'O_NH_2_relidx', 'O_NH_2_energy']
Found 13 DSSP features in the dataframe
Using 'chain' for chain ID and 'resnum' for residue number
Updated 744 out of 744 nodes
Added the following features to nodes: ['dssp_aa', 'dssp_index', 'nh_o1_energy', 'nh_o1_relidx', 'nh_o2_energy', 'nh_o2_relidx', 'o_nh1_energy', 'o_nh1_relidx', 'o_nh2_energy', 'o_nh2_relidx', 'phi', 'psi', 'ss_value']
Graph type: <class 'networkx.classes.graph.Graph'>

Number of nodes: 744
Sample node: A:GLN:1
Sample node attributes: {'chain_id': 'A', 'residue_name': 'GLN', 'residue_number': 1, 'atom_type': 'CA', 'element_symbol': 'C', 'coords': array([-71.294,  32.652,  -7.308], dtype=float32), 'b_factor': 70.94999694824219, 'amino_acid_one_hot': array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 

In [16]:
# this could take in a list of chains from the known nanos directory.

def extract_chain_subgraphs(graph):
    """
    Extract subgraphs for each chain in the input graph.

    Parameters:
    -----------
    graph : graphein graph object
        The input protein graph with multiple chains

    Returns:
    --------
    dict
        Dictionary mapping chain IDs to their respective subgraphs
    """
    # Get all unique chain IDs
    chains = set([data["chain_id"] for _, data in graph.nodes(data=True)])
    print(f"Available chains: {chains}")
    print(f"Graph name: {graph.name}")

    # Create a dictionary to store all subgraphs
    chain_subgraphs = {}

    # Extract subgraph for each chain
    for chain in chains:
        subgraph = gp.extract_subgraph_from_chains(graph, [chain])
        chain_subgraphs[chain] = subgraph
        print(f"Extracted subgraph for chain {chain} with {subgraph.number_of_nodes()} nodes")

    return chain_subgraphs

# Usage
chain_subgraphs = extract_chain_subgraphs(graph_test)

# Access individual chain subgraphs
s_g_A = chain_subgraphs.get("A")
if s_g_A:
    debug_graph(s_g_A)

# You can loop through all chain subgraphs if needed
for chain_id, subgraph in chain_subgraphs.items():
    print(f"Chain {chain_id}: {subgraph.number_of_nodes()} nodes, {subgraph.number_of_edges()} edges")

with open("2X89_nanobody_A.pkl", "wb") as f: # or json or whatever...
    pickle.dump(s_g_A, f)

Available chains: {'C', 'E', 'G', 'D', 'A', 'B', 'F'}
Graph name: 2X89
Extracted subgraph for chain C with 128 nodes
Extracted subgraph for chain E with 91 nodes
Extracted subgraph for chain G with 86 nodes
Extracted subgraph for chain D with 92 nodes
Extracted subgraph for chain A with 128 nodes
Extracted subgraph for chain B with 128 nodes
Extracted subgraph for chain F with 91 nodes
Graph type: <class 'networkx.classes.graph.Graph'>

Number of nodes: 128
Sample node: A:GLY:9
Sample node attributes: {'chain_id': 'A', 'residue_name': 'GLY', 'residue_number': 9, 'atom_type': 'CA', 'element_symbol': 'C', 'coords': array([-79.664,  15.809,  10.036], dtype=float32), 'b_factor': 60.4900016784668, 'amino_acid_one_hot': array([0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]), 'meiler': dim_1    0.00
dim_2    0.00
dim_3    0.00
dim_4    0.00
dim_5    6.07
dim_6    0.13
dim_7    0.15
Name: GLY, dtype: float64, 'pka_cooh_alpha': 2.34, 'pka_nh3': 9.6, 'pka_rgroup': 7.0, 'isoelectric_

Send on to pytorch geometric memory loader dataset or whatever and Now it would be possible to use this as a loop to get and split all of the pdbs to generate the graphs for each nano - based on the nano directory or or or