In [None]:
from graphein.ml.conversion import GraphFormatConvertor
import numpy as np
import torch
from sklearn.model_selection import train_test_split
from graphein.protein.utils import get_obsolete_mapping
import pandas as pd
import os
from tqdm.notebook import tqdm

In [None]:
# CONFIGS
import graphein.protein as gp
from functools import partial
from graphein.ml.conversion import GraphFormatConvertor
from graphein.protein.edges.distance import (add_peptide_bonds,
                                             add_hydrogen_bond_interactions,
                                             add_disulfide_interactions,
                                             add_ionic_interactions,
                                             add_aromatic_interactions,
                                             add_aromatic_sulphur_interactions,
                                             add_cation_pi_interactions
                                             )


# Three alternative edge schemes. Each is a keyword dictionary intended to be
# splatted into gp.ProteinGraphConfig.

# 1: Distance-based
dist_edge_func = dict(
    edge_construction_functions=[
        partial(gp.add_distance_threshold, threshold=5, long_interaction_threshold=0),
    ],
)

# 2: Biochemical interactions, select set
select_edge_func = dict(
    edge_construction_functions=[
        add_peptide_bonds,
        add_hydrogen_bond_interactions,
        add_disulfide_interactions,
        add_ionic_interactions,
        gp.add_salt_bridges,
    ],
)

# 3: Biochemical interactions, expanded set
all_edge_func = dict(
    edge_construction_functions=[
        add_peptide_bonds,
        add_aromatic_interactions,
        add_hydrogen_bond_interactions,
        add_disulfide_interactions,
        add_ionic_interactions,
        add_aromatic_sulphur_interactions,
        add_cation_pi_interactions,
        gp.add_hydrophobic_interactions,
        gp.add_vdw_interactions,
        gp.add_backbone_carbonyl_carbonyl_interactions,
        gp.add_salt_bridges,
    ],
)

In [None]:

# A: Just one-hot encodings
one_hot = dict(node_metadata_functions=[gp.amino_acid_one_hot])

# B: Selected biochemical features
all_graph_metadata = dict(
    graph_metadata_functions=[
        gp.rsa,
        gp.secondary_structure,
    ],
)
all_node_metadata = dict(
    node_metadata_functions=[
        gp.amino_acid_one_hot,
        gp.meiler_embedding,
        partial(gp.expasy_protein_scale, add_separate=True),
    ],
    dssp_config=gp.DSSPConfig(),
)

# One config per (edge scheme 1-3) x (feature scheme A/B) pair. The keyword
# dictionaries share no keys, so they can be splatted side by side.
config_1A = gp.ProteinGraphConfig(**dist_edge_func, **one_hot)
config_1B = gp.ProteinGraphConfig(**dist_edge_func, **all_graph_metadata, **all_node_metadata)

config_2A = gp.ProteinGraphConfig(**select_edge_func, **one_hot)
config_2B = gp.ProteinGraphConfig(**select_edge_func, **all_graph_metadata, **all_node_metadata)

config_3A = gp.ProteinGraphConfig(**all_edge_func, **one_hot)
config_3B = gp.ProteinGraphConfig(**all_edge_func, **all_graph_metadata, **all_node_metadata)

In [None]:

from graphein.ml import InMemoryProteinGraphDataset
import os
# NOTE: the assignments below rebind names that the earlier configuration cells
# also define (dist_edge_func, select_edge_func, all_edge_func, one_hot,
# all_graph_metadata, all_node_metadata, config_1A). Only config_1A is rebuilt
# from the new values here.

# 1: Distance-based
dist_edge_func = dict(
    edge_construction_functions=[
        partial(gp.add_distance_threshold, threshold=5, long_interaction_threshold=0),
    ],
)

# 2: Biochemical interactions, select set (peptide and hydrogen bonds only in this cell)
select_edge_func = dict(
    edge_construction_functions=[
        add_peptide_bonds,
        add_hydrogen_bond_interactions,
    ],
)

# 3: Biochemical interactions, expanded set
all_edge_func = dict(
    edge_construction_functions=[
        add_peptide_bonds,
        add_aromatic_interactions,
        add_hydrogen_bond_interactions,
        add_disulfide_interactions,
        add_ionic_interactions,
        add_aromatic_sulphur_interactions,
        add_cation_pi_interactions,
        gp.add_hydrophobic_interactions,
        gp.add_vdw_interactions,
        gp.add_backbone_carbonyl_carbonyl_interactions,
        gp.add_salt_bridges,
    ],
)

# A: despite its name, `one_hot` here carries the one-hot encoding plus the
# meiler_embedding and expasy_protein_scale node functions.
one_hot = dict(
    node_metadata_functions=[
        gp.amino_acid_one_hot,
        gp.meiler_embedding,
        partial(gp.expasy_protein_scale, add_separate=True),
    ],
)

# B: Selected biochemical features
all_graph_metadata = dict(
    graph_metadata_functions=[
        gp.rsa,
        gp.secondary_structure,
    ],
)
all_node_metadata = dict(
    node_metadata_functions=[
        gp.amino_acid_one_hot,
        gp.meiler_embedding,
        partial(gp.expasy_protein_scale, add_separate=True),
    ],
    #"dssp_config": gp.DSSPConfig()
)


config_1A = gp.ProteinGraphConfig(**dist_edge_func, **one_hot)
# config_3B is not rebuilt in this cell; it is whatever an earlier cell bound.
config = config_3B #1A is least memory-intensive
# NetworkX -> PyG convertor. `columns` lists the attribute names requested from
# the NetworkX graph; the list is assembled from thematic groups, in the same
# order as before.
convertor = GraphFormatConvertor(
    src_format="nx",
    dst_format="pyg",
    columns=[
        # Core structural properties
        *["coords", "edge_index"],
        # Amino acid identity features
        *["amino_acid_one_hot", "residue_name", "residue_number", "chain_id"],
        # Embedding vectors
        *["meiler"],
        # Physical properties
        *["bulkiness", "molecularweight", "refractivity", "recognitionfactors"],
        # Chemical properties
        *["pka_cooh_alpha", "pka_nh3", "pka_rgroup", "isoelectric_points"],
        # Polarity measures
        *["polarityzimmerman", "polaritygrantham"],
        # Hydrophobicity scales (all variants), spelled "hphob_<scale>"
        *[
            f"hphob_{scale}"
            for scale in (
                "eisenberg", "sweet", "woods", "doolittle",
                "manavalan", "leo", "black", "breese",
                "fauchere", "guy", "janin", "miyazawa",
                "argos", "roseman", "tanford", "wolfenden",
                "welling", "wilson", "parker", "ph3_4",
                "ph7_5", "mobility", "chothia", "rose",
            )
        ],
        # Structure related
        *[
            "rsa", "ss", "buriedresidues", "accessibleresidues",
            "averageburied", "averageflexibility",
        ],
        # Secondary structure propensities
        *["alpha_helixfasman", "beta_sheetfasman", "beta_turnfasman"],
        *["alpha_helixroux", "beta_sheetroux", "beta_turnroux", "coilroux"],
        *["alpha_helixlevitt", "beta_sheetlevitt", "beta_turnlevitt"],
        # Beta structure details
        *["totalbeta_strand", "antiparallelbeta_strand", "parallelbeta_strand"],
        # Other properties
        *["transmembranetendency", "numbercodons", "ratioside"],
        *["a_a_composition", "a_a_swiss_prot", "relativemutability"],
        # HPLC related
        *["hplchfba", "hplctfa", "hplc2_1", "hplc7_4"],
    ],
)

# Get paths to all your PDB files. The listing is sorted so the dataset order
# is reproducible (os.listdir returns entries in arbitrary order).
pdb_dir = os.path.expanduser("~/Downloads/nanobody_extracted2")
pdb_paths = sorted(
    os.path.join(pdb_dir, f) for f in os.listdir(pdb_dir) if f.endswith('.pdb')
)

# Create label map (assuming you have a way to determine labels)
# For example, if nanobody in filename means label=1.
# The test is made against the file stem only: the full path always contains
# the directory name "nanobody_extracted2", so testing the path would label
# every structure 1.
label_map = {
    stem: 1 if "nanobody" in stem else 0
    for stem in (os.path.splitext(os.path.basename(path))[0] for path in pdb_paths)
}

# Build the training dataset from the local PDB files.
# NOTE(review): this passes config_1A, not the `config` variable assigned
# earlier in this cell -- confirm which one is intended.
train_ds = InMemoryProteinGraphDataset(
    name="train",
    root="data/",
    # Local files are supplied through `paths` rather than PDB codes.
    paths=pdb_paths,
    graph_label_map=label_map,
    # Swap in whichever config you prefer.
    graphein_config=config_1A,
    graph_transformation_funcs=[],
    graph_format_convertor=convertor,
)

In [None]:
# Prefer the current home of DataLoader; `torch_geometric.data.DataLoader` is
# the deprecated location and is kept only as a fallback for older installs.
try:
    from torch_geometric.loader import DataLoader
except ImportError:
    from torch_geometric.data import DataLoader

# Shuffled mini-batches of 4 graphs; a trailing incomplete batch is dropped.
train_loader = DataLoader(train_ds, batch_size=4, shuffle=True, drop_last=True)


In [None]:
# Show only the first item of the dataset; an empty dataset prints nothing.
try:
    b = next(iter(train_ds))
except StopIteration:
    pass
else:
    print(b)

In [None]:
# Report how many items the dataset holds.
num_proteins = len(train_ds)
print("Number of proteins in the dataset:", num_proteins)

In [None]:
import pandas as pd

def protein_to_dataframe(protein_data):
    """Convert a PyG protein data object to a pandas DataFrame.

    One row is produced per node. Only attributes whose first dimension equals
    ``protein_data.num_nodes`` are included; everything else (attributes with
    no ``shape``, 0-d values, arrays of another length, arrays with more than
    two dimensions) is skipped.

    Args:
        protein_data: Object exposing ``num_nodes``, ``keys`` (either an
            iterable of attribute names or a zero-argument method returning
            one) and the named attributes themselves.

    Returns:
        pandas.DataFrame with a ``node_idx`` column, one column per 1-D
        per-node attribute, an ``amino_acid`` column derived from
        ``amino_acid_one_hot`` when present, and one stringified-list column
        per other 2-D per-node attribute.
    """
    # Three-letter codes used to decode the one-hot columns.
    # NOTE(review): this assumes the one-hot columns are ordered alphabetically
    # by three-letter code -- confirm against the vocabulary used by the
    # encoder that produced `amino_acid_one_hot`.
    aa_types = ['ALA', 'ARG', 'ASN', 'ASP', 'CYS', 'GLN', 'GLU', 'GLY', 'HIS', 'ILE',
                'LEU', 'LYS', 'MET', 'PHE', 'PRO', 'SER', 'THR', 'TRP', 'TYR', 'VAL']

    num_nodes = protein_data.num_nodes

    # `keys` may be a plain iterable or a method, depending on the data object
    # (newer PyG releases expose it as a method); accept both.
    keys = protein_data.keys
    if callable(keys):
        keys = keys()

    # Basic node indices come first so they are the leading column.
    data_dict = {'node_idx': list(range(num_nodes))}

    for key in keys:
        attr = getattr(protein_data, key)
        shape = getattr(attr, 'shape', None)
        # Keep per-node arrays only. The explicit length check avoids an
        # IndexError on 0-d values, whose shape is empty.
        if attr is None or shape is None or len(shape) == 0 or shape[0] != num_nodes:
            continue

        if len(shape) == 1:  # Single value per node
            data_dict[key] = attr.tolist()
        elif len(shape) == 2:  # Vector per node
            if key == 'amino_acid_one_hot':
                # Decode the one-hot rows. A row with no hot entry cannot name
                # a residue, so it maps to 'UNK' rather than to index 0.
                aa_indices = attr.argmax(dim=1).tolist()
                has_hot_entry = attr.ne(0).any(dim=1).tolist()
                data_dict['amino_acid'] = [
                    aa_types[idx] if hot and idx < len(aa_types) else 'UNK'
                    for idx, hot in zip(aa_indices, has_hot_entry)
                ]
            else:
                # Other vector features are stored as stringified lists.
                data_dict[key] = [str(attr[i].tolist()) for i in range(num_nodes)]

    return pd.DataFrame(data_dict)

# Convert first protein to DataFrame and display.
# `train_ds` comes from the dataset cell; index 0 is its first item.
protein_df = protein_to_dataframe(train_ds[0])
print(protein_df.head(10))  # Show first 10 residues

In [None]:
from graphein.protein.config import ProteinGraphConfig, DSSPConfig
from graphein.protein.graphs import construct_graph
from graphein.protein.edges.distance import (
    add_aromatic_interactions,
    add_disulfide_interactions,
    add_hydrophobic_interactions,
    add_peptide_bonds,
)
from graphein.protein.features.nodes import asa, rsa

# Build graph `g` for PDB code "3eiy" with four edge-construction functions and
# the `asa` / `rsa` graph-metadata functions.
# NOTE: this rebinds `config`, which an earlier cell also assigns.
config = ProteinGraphConfig(
    edge_construction_functions=[       # List of functions to call to construct edges.
        add_hydrophobic_interactions,
        add_aromatic_interactions,
        add_disulfide_interactions,
        add_peptide_bonds,
    ],
    graph_metadata_functions=[asa, rsa],  # Add ASA and RSA features.
    dssp_config=DSSPConfig(),             # Add DSSP config in order to compute ASA and RSA.
)

# `g` is reused by the plotting and summary cells that follow.
g = construct_graph(pdb_code="3eiy", config=config)

In [None]:
from graphein.protein.visualisation import plotly_protein_structure_graph

# Plot graph `g`; as the last expression of the cell, the returned object is
# what the notebook renders.
plotly_protein_structure_graph(g, node_size_multiplier=1)

In [None]:
from graphein.protein.analysis import plot_residue_composition

# Residue composition of `g`, sorted by count, drawn with plot_type="pie".
fig = plot_residue_composition(g, sort_by="count", plot_type="pie") # Can also sort by "alphabetical"
fig.show()

In [None]:
# Same residue composition call as the previous cell, with plot_type="bar".
fig = plot_residue_composition(g, sort_by="count", plot_type="bar")
fig.show()

In [None]:
from graphein.protein.analysis import graph_summary

# Summarise graph `g`; the return value is shown as the cell output.
graph_summary(g)

In [None]:
# Same summary as the previous cell, requested with plot=True.
graph_summary(g, plot=True)

In [None]:
from graphein.protein.analysis import plot_degree_distribution

# Degree distribution of graph `g`.
fig = plot_degree_distribution(g)
fig.show()

In [None]:
from graphein.protein.analysis import plot_degree_by_residue_type

# Degree by residue type for `g`, with normalise_by_residue_occurrence=False.
fig = plot_degree_by_residue_type(g, normalise_by_residue_occurrence=False)
fig.show()

In [None]:
# Same plot as the previous cell, with normalise_by_residue_occurrence=True.
fig = plot_degree_by_residue_type(g, normalise_by_residue_occurrence=True)
fig.show()

In [None]:
from graphein.protein.analysis import plot_edge_type_distribution

# Edge type distribution of `g`, drawn with plot_type="bar".
fig = plot_edge_type_distribution(g, plot_type="bar")
fig.show()

In [None]:
from graphein.protein.analysis import plot_edge_type_distribution

# Edge type distribution of `g` again, this time with plot_type="pie".
fig = plot_edge_type_distribution(g, plot_type="pie")
fig.show()

In [None]:
# The original cell was the function's *signature* (type annotations and
# defaults) pasted as a call, which is a SyntaxError. This is the equivalent
# valid call with every option spelled out as a keyword argument.
# The import is local to this cell because the name is otherwise only imported
# by a later cell.
from graphein.protein.analysis import plot_graph_metric_property_correlation

plot_graph_metric_property_correlation(
    g,                                          # Graph to plot
    summary_statistics=[                        # Graph theoretic metrics to include
        "degree",
        "betweenness_centrality",
        "closeness_centrality",
        "eigenvector_centrality",
        "communicability_betweenness_centrality",
    ],
    properties=["asa", "rsa"],                  # Node features to include
    colour_by="residue_type",                   # How to colour the points
    opacity=0.2,                                # Opacity of markers
    diagonal_visible=True,                      # Whether or not to show the leading diagonal of the plot
    title=None,                                 # Plot title
    height=1000,                                # Plot height
    width=1000,                                 # Plot width
    font_size=10,                               # Font size for axes, title and ticks
)

In [None]:
from graphein.protein.analysis import plot_graph_metric_property_correlation

# Correlation plot for `g` with the leading diagonal hidden; every other option
# is left at its default.
plot_graph_metric_property_correlation(g, diagonal_visible=False)

In [None]:
# Same plot as the previous cell, with colour_by=None.
plot_graph_metric_property_correlation(g, diagonal_visible=False, colour_by=None)

In [None]:
# Print the attribute names stored on the first node of `g`; a graph with no
# nodes prints nothing.
try:
    _, d = next(iter(g.nodes(data=True)))
except StopIteration:
    pass
else:
    print(d.keys())

In [None]:
from functools import partial
from graphein.protein.features.nodes import expasy_protein_scale

# Construct the graph with the expasy features.
# The edge functions used here are imported by an earlier cell. Both `config`
# and `g` are rebound, replacing the ASA/RSA graph built previously.
config = ProteinGraphConfig(
    edge_construction_functions=[
        add_hydrophobic_interactions,
        add_aromatic_interactions,
        add_disulfide_interactions,
        add_peptide_bonds,
    ],
    node_metadata_functions=[partial(expasy_protein_scale, add_separate=True)], # Add expasy scale (wrapped in partial so that each feature is added under a separate key)
)
g = construct_graph(
    pdb_code="3eiy",
    config=config
)

# Plot the graph metrics against a hand-picked subset of the expasy properties.
plot_graph_metric_property_correlation(
    g,
    diagonal_visible=False,
    colour_by="residue_type",
    properties=[
        "pka_rgroup",
        "isoelectric_points",
        "bulkiness",
        "transmembranetendency",
        "coilroux",
        "relativemutability"
    ]
)

In [None]:
from graphein.protein.graphs import construct_graph
from graphein.protein.config import ProteinGraphConfig
from graphein.protein.edges.distance import add_hydrogen_bond_interactions, add_ionic_interactions, add_peptide_bonds

# Create backbone graph (default ProteinGraphConfig)
config = ProteinGraphConfig()
simple_graph = construct_graph(config=config, pdb_code="3eiy")

# Create backbone graph with additional interactions
edge_funcs = [
    add_hydrogen_bond_interactions,
    add_ionic_interactions,
    add_peptide_bonds,
]
config = ProteinGraphConfig(edge_construction_functions=edge_funcs)
complex_graph = construct_graph(config=config, pdb_code="3eiy")

In [None]:
from graphein.protein.visualisation import plot_distance_matrix

# Plot the distance matrix of `simple_graph`.
plot_distance_matrix(simple_graph)

In [None]:
# Threshold the distance matrix into a contact map: 1 where two nodes are
# closer than the cutoff, 0 otherwise. The original comparison was `> 10`,
# which marks the non-contacts instead.
# NOTE(review): the cutoff of 10 is presumably in Angstroms -- confirm the
# units of `dist_mat`.
contact_map = (simple_graph.graph["dist_mat"] < 10).astype(int)
plot_distance_matrix(g=simple_graph, dist_mat=contact_map) # Plot contact map

In [None]:
from graphein.protein.visualisation import plot_distance_landscape

# Plot the distance landscape of `simple_graph`.
plot_distance_landscape(simple_graph)

In [None]:
from graphein.protein.visualisation import plotly_protein_structure_graph

# Plot `simple_graph`, colouring nodes by residue number and edges by their
# "kind" attribute.
plotly_protein_structure_graph(
    G=simple_graph,
    plot_title="Simple Protein Structure (Backbone only)",
    colour_nodes_by="residue_number",
    colour_edges_by="kind",
    node_size_min=1
)

In [None]:
from graphein.protein.visualisation import plotly_protein_structure_graph

# Plot `complex_graph` (built with hydrogen-bond, ionic and peptide-bond edge
# functions) using the same colouring as the simple graph, with larger nodes.
plotly_protein_structure_graph(
    G=complex_graph,
    plot_title="Residue level graph with Hydrogen bonds, ionic interactions and peptide bonds",
    colour_nodes_by="residue_number",
    colour_edges_by="kind",
    node_size_min=20,
    node_size_multiplier=1
)

In [None]:
from graphein.protein.edges.atomic import add_atomic_edges
# Create atomic graph. Note that `config` and `g` are rebound here, replacing
# the graph used by the earlier analysis cells.
config = ProteinGraphConfig(
    edge_construction_functions=[add_atomic_edges],
    granularity='atom',
)
g = construct_graph(config=config, pdb_code="3eiy")

# Plot it, colouring nodes by atom type and edges by kind.
plotly_protein_structure_graph(
    G=g,
    plot_title="Atom level graph",
    colour_edges_by="kind",
    colour_nodes_by="atom_type",
    node_size_multiplier=1,
    node_size_min=10,
)

In [None]:
from graphein.protein.visualisation import plot_protein_structure_graph

# Plot `simple_graph` with plot_protein_structure_graph at angle=0, without
# node-id labels; the returned object is kept in `p`.
p = plot_protein_structure_graph(G=simple_graph, angle=0, colour_edges_by="kind", colour_nodes_by="seq_position", label_node_ids=False)

In [None]:
from graphein.protein.visualisation import asteroid_plot
# Asteroid plot of `complex_graph` around node "A:VAL:70" with k=4, coloured by
# shell. NOTE(review): k presumably sets how many hops from the node are
# drawn -- confirm against the graphein documentation.
asteroid_plot(complex_graph, node_id="A:VAL:70", k=4, colour_nodes_by="shell")