# Constructing Subgraphs in Graphein
Graphein provides utilities for extracting various subgraphs. These are composable to enable quite specific subsets.

We start by constructing a graph with several different edge types.

In [None]:
# Install Graphein if necessary (uncomment the line below to run it).
# %pip install graphein

In [2]:
from graphein.protein.config import ProteinGraphConfig
# Import each edge constructor by name instead of `import *`, so it is clear
# where every function referenced below comes from.
from graphein.protein.edges.distance import (
    add_aromatic_interactions,
    add_aromatic_sulphur_interactions,
    add_cation_pi_interactions,
    add_disulfide_interactions,
    add_hydrogen_bond_interactions,
    add_hydrophobic_interactions,
    add_ionic_interactions,
    add_peptide_bonds,
)
from graphein.protein.graphs import construct_graph

# Edge construction functions handed to the graph config.
edge_fns = [
    add_aromatic_interactions,
    add_hydrophobic_interactions,
    add_aromatic_sulphur_interactions,
    add_cation_pi_interactions,
    add_disulfide_interactions,
    add_hydrogen_bond_interactions,
    add_ionic_interactions,
    add_peptide_bonds,
]
config = ProteinGraphConfig(edge_construction_functions=edge_fns)
print(config)

# Construct the graph for PDB code 4hhb with the configuration above.
g = construct_graph(config=config, pdb_code="4hhb")

granularity='CA' keep_hets=False insertions=False pdb_dir=PosixPath('../examples/pdbs') verbose=False exclude_waters=True deprotonate=False protein_df_processing_functions=None edge_construction_functions=[<function add_aromatic_interactions at 0x7fdf392c6670>, <function add_hydrophobic_interactions at 0x7fdf392c6430>, <function add_aromatic_sulphur_interactions at 0x7fdf392c6700>, <function add_cation_pi_interactions at 0x7fdf392c6790>, <function add_disulfide_interactions at 0x7fdf392c64c0>, <function add_hydrogen_bond_interactions at 0x7fdf392c6550>, <function add_ionic_interactions at 0x7fdf392c65e0>, <function add_peptide_bonds at 0x7fdf392c63a0>] node_metadata_functions=[<function meiler_embedding at 0x7fdf392c9d30>] edge_metadata_functions=None graph_metadata_functions=None get_contacts_config=None dssp_config=None


DEBUG:graphein.protein.graphs:Deprotonating protein. This removes H atoms from the pdb_df dataframe
DEBUG:graphein.protein.graphs:Detected 574 total nodes
DEBUG:graphein.protein.features.nodes.amino_acid:Reading meiler embeddings from: /Users/arianjamasb/github/graphein/graphein/protein/features/nodes/meiler_embeddings.csv
INFO:graphein.protein.edges.distance:Found: 84 aromatic-aromatic interactions
INFO:graphein.protein.edges.distance:Found 1284 hydrophobic interactions.
INFO:graphein.protein.edges.distance:Found 6 disulfide interactions.
INFO:graphein.protein.edges.distance:Found 208 hbond interactions.
INFO:graphein.protein.edges.distance:Found 12 hbond interactions.
INFO:graphein.protein.edges.distance:Found 420 ionic interactions.


In [2]:
from graphein.protein.visualisation import plotly_protein_structure_graph

# Plot the full graph constructed above.
plotly_protein_structure_graph(
    g,
    node_size_min=4,
    node_size_multiplier=2,
)

## Subsetting with a list of nodes
The simplest method of constructing a subgraph is when we already have a defined list of nodes that we wish to extract. The naming convention for nodes is:

`CHAIN:RESIDUE_NAME:POSITION`

e.g: `A:ALA:110`

We can use the `extract_subgraph_from_node_list()` function to achieve this.

```python
extract_subgraph_from_node_list(
    g,
    node_list: Optional[List[str]],
    filter_dataframe: bool = True,
    inverse: bool = False,
    return_node_list: bool = False
)
```

* Selections can be inverted with the `inverse` parameter
* Whether or not we wish to filter the `pdb_df` dataframe associated with the graph (accessed via `g.graph["pdb_df"]`) is controlled by the `filter_dataframe` parameter
* If we just wish to retrieve a list of nodes identified by the selection, instead of returning the subgraph itself we specify this with the `return_node_list` parameter.

This is the core subsetting function. The other subsetting functions described below are based on different methods for computing a list of nodes to subset the graph to. If you wish to implement a subsetting method not described here, you simply need to compute a list of node_ids and provide them to this function.

In [5]:
from graphein.protein.utils import extract_subgraph_from_node_list

# Node ids (CHAIN:RESIDUE_NAME:POSITION) to keep in the subgraph.
NODE_LIST = ['B:LYS:82', 'B:GLY:83', 'B:THR:84', 'B:PHE:85', 'B:ALA:86', 'B:THR:87', 'B:LEU:88', 'B:SER:89', 'B:GLU:90', 'B:LEU:91', 'B:HIS:92', 'B:CYS:93', 'B:ASP:94', 'B:LYS:95', 'B:LEU:96', 'B:HIS:97', 'B:VAL:98', 'B:ASP:99', 'B:PRO:100', 'B:GLU:101', 'B:ASN:102', 'B:PHE:103', 'B:ARG:104', 'B:LEU:105', 'B:LEU:106', 'B:GLY:107', 'B:ASN:108', 'B:VAL:109', 'B:LEU:110', 'B:VAL:111', 'B:CYS:112', 'B:VAL:113', 'B:LEU:114', 'B:ALA:115', 'B:HIS:116', 'B:HIS:117', 'B:PHE:118', 'B:GLY:119', 'B:LYS:120', 'B:GLU:121', 'B:PHE:122', 'B:THR:123', 'B:PRO:124', 'B:PRO:125', 'B:VAL:126', 'B:GLN:127', 'B:ALA:128', 'B:ALA:129', 'B:TYR:130', 'B:GLN:131', 'B:LYS:132', 'B:VAL:133', 'B:VAL:134', 'B:ALA:135', 'B:GLY:136', 'B:VAL:137', 'B:ALA:138', 'B:ASN:139', 'B:ALA:140', 'B:LEU:141', 'B:ALA:142', 'B:HIS:143', 'B:LYS:144', 'B:TYR:145', 'B:HIS:146', 'C:VAL:1', 'C:LEU:2', 'C:SER:3', 'C:PRO:4', 'C:ALA:5', 'C:ASP:6', 'C:LYS:7', 'C:THR:8', 'C:ASN:9', 'C:VAL:10', 'C:LYS:11', 'C:ALA:12', 'C:ALA:13', 'C:TRP:14', 'C:GLY:15', 'C:LYS:16', 'C:VAL:17', 'C:GLY:18', 'C:ALA:19', 'C:HIS:20', 'C:ALA:21', 'C:GLY:22', 'C:GLU:23', 'C:TYR:24', 'C:GLY:25', 'C:ALA:26', 'C:GLU:27', 'C:ALA:28', 'C:LEU:29', 'C:GLU:30', 'C:ARG:31', 'C:MET:32', 'C:PHE:33', 'C:LEU:34', 'C:SER:35', 'C:PHE:36', 'C:PRO:37', 'C:THR:38', 'C:THR:39', 'C:LYS:40', 'C:THR:41', 'C:TYR:42', 'C:PHE:43', 'C:PRO:44', 'C:HIS:45', 'C:PHE:46', 'C:ASP:47', 'C:LEU:48', 'C:SER:49', 'C:HIS:50', 'C:GLY:51', 'C:SER:52', 'C:ALA:53', 'C:GLN:54', 'C:VAL:55', 'C:LYS:56', 'C:GLY:57', 'C:HIS:58', 'C:GLY:59', 'C:LYS:60', 'C:LYS:61', 'C:VAL:62', 'C:ALA:63', 'C:ASP:64', 'C:ALA:65', 'C:LEU:66', 'C:THR:67', 'C:ASN:68', 'C:ALA:69', 'C:VAL:70', 'C:ALA:71']

s_g = extract_subgraph_from_node_list(g, NODE_LIST)

# Sanity checks: every subgraph node was requested, and every requested node
# exists in the original graph.
assert all(node in NODE_LIST for node in s_g.nodes())
assert all(node in g.nodes() for node in NODE_LIST)

plotly_protein_structure_graph(
    s_g,
    node_size_min=4,
    node_size_multiplier=2,
)

DEBUG:graphein.protein.utils:Creating subgraph from nodes: ['B:LYS:82', 'B:GLY:83', 'B:THR:84', 'B:PHE:85', 'B:ALA:86', 'B:THR:87', 'B:LEU:88', 'B:SER:89', 'B:GLU:90', 'B:LEU:91', 'B:HIS:92', 'B:CYS:93', 'B:ASP:94', 'B:LYS:95', 'B:LEU:96', 'B:HIS:97', 'B:VAL:98', 'B:ASP:99', 'B:PRO:100', 'B:GLU:101', 'B:ASN:102', 'B:PHE:103', 'B:ARG:104', 'B:LEU:105', 'B:LEU:106', 'B:GLY:107', 'B:ASN:108', 'B:VAL:109', 'B:LEU:110', 'B:VAL:111', 'B:CYS:112', 'B:VAL:113', 'B:LEU:114', 'B:ALA:115', 'B:HIS:116', 'B:HIS:117', 'B:PHE:118', 'B:GLY:119', 'B:LYS:120', 'B:GLU:121', 'B:PHE:122', 'B:THR:123', 'B:PRO:124', 'B:PRO:125', 'B:VAL:126', 'B:GLN:127', 'B:ALA:128', 'B:ALA:129', 'B:TYR:130', 'B:GLN:131', 'B:LYS:132', 'B:VAL:133', 'B:VAL:134', 'B:ALA:135', 'B:GLY:136', 'B:VAL:137', 'B:ALA:138', 'B:ASN:139', 'B:ALA:140', 'B:LEU:141', 'B:ALA:142', 'B:HIS:143', 'B:LYS:144', 'B:TYR:145', 'B:HIS:146', 'C:VAL:1', 'C:LEU:2', 'C:SER:3', 'C:PRO:4', 'C:ALA:5', 'C:ASP:6', 'C:LYS:7', 'C:THR:8', 'C:ASN:9', 'C:VAL:10', 'C

In [7]:
# By default, the dataframe associated with the graph is filtered so that it only
# includes the nodes remaining in the subgraph.
# If this is not desired, pass filter_dataframe=False when extracting the subgraph.
s_g.graph["pdb_df"]

Unnamed: 0,record_name,atom_number,blank_1,atom_name,alt_loc,residue_name,blank_2,chain_id,residue_number,insertion,...,y_coord,z_coord,occupancy,b_factor,blank_4,segment_id,element_symbol,charge,line_idx,node_id
222,ATOM,1689,,CA,,LYS,,B,82,,...,-20.862,8.452,1.0,24.25,,,C,,2572,B:LYS:82
223,ATOM,1698,,CA,,GLY,,B,83,,...,-23.724,10.746,1.0,41.64,,,C,,2581,B:GLY:83
224,ATOM,1702,,CA,,THR,,B,84,,...,-22.242,11.744,1.0,25.47,,,C,,2585,B:THR:84
225,ATOM,1709,,CA,,PHE,,B,85,,...,-18.963,12.749,1.0,21.59,,,C,,2592,B:PHE:85
226,ATOM,1720,,CA,,ALA,,B,86,,...,-20.242,13.948,1.0,23.14,,,C,,2603,B:ALA:86
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
353,ATOM,2694,,CA,,THR,,C,67,,...,16.119,9.983,1.0,15.27,,,C,,3577,C:THR:67
354,ATOM,2701,,CA,,ASN,,C,68,,...,18.613,11.088,1.0,21.49,,,C,,3584,C:ASN:68
355,ATOM,2709,,CA,,ALA,,C,69,,...,17.929,8.006,1.0,15.27,,,C,,3592,C:ALA:69
356,ATOM,2714,,CA,,VAL,,C,70,,...,18.432,5.673,1.0,21.72,,,C,,3597,C:VAL:70


In [27]:
# Invert the selection by passing inverse=True.
s_g = extract_subgraph_from_node_list(g, NODE_LIST, inverse=True)
plotly_protein_structure_graph(
    s_g,
    node_size_min=4,
    node_size_multiplier=2,
)

DEBUG:graphein.protein.utils:Creating subgraph from nodes: ['B:LYS:82', 'B:GLY:83', 'B:THR:84', 'B:PHE:85', 'B:ALA:86', 'B:THR:87', 'B:LEU:88', 'B:SER:89', 'B:GLU:90', 'B:LEU:91', 'B:HIS:92', 'B:CYS:93', 'B:ASP:94', 'B:LYS:95', 'B:LEU:96', 'B:HIS:97', 'B:VAL:98', 'B:ASP:99', 'B:PRO:100', 'B:GLU:101', 'B:ASN:102', 'B:PHE:103', 'B:ARG:104', 'B:LEU:105', 'B:LEU:106', 'B:GLY:107', 'B:ASN:108', 'B:VAL:109', 'B:LEU:110', 'B:VAL:111', 'B:CYS:112', 'B:VAL:113', 'B:LEU:114', 'B:ALA:115', 'B:HIS:116', 'B:HIS:117', 'B:PHE:118', 'B:GLY:119', 'B:LYS:120', 'B:GLU:121', 'B:PHE:122', 'B:THR:123', 'B:PRO:124', 'B:PRO:125', 'B:VAL:126', 'B:GLN:127', 'B:ALA:128', 'B:ALA:129', 'B:TYR:130', 'B:GLN:131', 'B:LYS:132', 'B:VAL:133', 'B:VAL:134', 'B:ALA:135', 'B:GLY:136', 'B:VAL:137', 'B:ALA:138', 'B:ASN:139', 'B:ALA:140', 'B:LEU:141', 'B:ALA:142', 'B:HIS:143', 'B:LYS:144', 'B:TYR:145', 'B:HIS:146', 'C:VAL:1', 'C:LEU:2', 'C:SER:3', 'C:PRO:4', 'C:ALA:5', 'C:ASP:6', 'C:LYS:7', 'C:THR:8', 'C:ASN:9', 'C:VAL:10', 'C

### Spatial Subgraphing

We can construct spatial subgraphs by specifying a central point and a radius. All nodes within that radius (Euclidean distance) will be selected. This selection can be inverted as before.

Here we select all nodes within 20 $\mathring A$ of the origin:

*N.B. different proteins may use different co-ordinate spaces*

In [8]:
from graphein.protein.utils import extract_subgraph_from_point

# Point-radius selection centred on the origin with a radius of 20.
s_g = extract_subgraph_from_point(
    g,
    centre_point=(0, 0, 0),
    radius=20,
)

plotly_protein_structure_graph(
    s_g,
    node_size_min=4,
    node_size_multiplier=2,
)

DEBUG:graphein.protein.utils:Found 177 nodes in the spatial point-radius subgraph.
DEBUG:graphein.protein.utils:Creating subgraph from nodes: ['C:ASN:97', 'C:ALA:123', 'A:LEU:136', 'B:VAL:111', 'C:HIS:103', 'C:LEU:91', 'C:LEU:29', 'D:LEU:31', 'C:PHE:36', 'A:TYR:42', 'D:LEU:32', 'A:SER:35', 'D:VAL:137', 'C:PHE:128', 'C:PHE:98', 'A:SER:133', 'B:PRO:100', 'B:LEU:110', 'D:ALA:135', 'D:TYR:145', 'C:VAL:93', 'A:VAL:93', 'C:PRO:95', 'A:LYS:127', 'C:ARG:141', 'A:LEU:29', 'D:VAL:109', 'B:PHE:103', 'D:HIS:143', 'A:PHE:128', 'A:VAL:96', 'A:ARG:92', 'B:TYR:35', 'A:ALA:123', 'C:PHE:33', 'A:TYR:140', 'A:ASP:94', 'D:LEU:141', 'A:LEU:129', 'C:ALA:28', 'C:LYS:40', 'D:TRP:37', 'D:GLN:131', 'B:LYS:132', 'D:ALA:142', 'A:ARG:141', 'B:ALA:140', 'D:VAL:111', 'A:ALA:88', 'D:LEU:105', 'B:LEU:32', 'D:VAL:134', 'B:VAL:109', 'D:ASP:99', 'A:LYS:99', 'A:HIS:103', 'B:GLY:136', 'C:THR:137', 'B:GLU:101', 'C:VAL:1', 'D:VAL:34', 'C:SER:102', 'A:LYS:139', 'C:SER:131', 'B:ASP:99', 'C:PRO:37', 'C:LYS:99', 'A:LEU:100', 'C:A

In [9]:
# As before, the selection can be inverted with inverse=True.
s_g = extract_subgraph_from_point(
    g,
    centre_point=(0, 0, 0),
    radius=20,
    inverse=True,
)
plotly_protein_structure_graph(
    s_g,
    node_size_min=4,
    node_size_multiplier=2,
)

DEBUG:graphein.protein.utils:Found 177 nodes in the spatial point-radius subgraph.
DEBUG:graphein.protein.utils:Creating subgraph from nodes: ['C:ASN:97', 'C:ALA:123', 'A:LEU:136', 'B:VAL:111', 'C:HIS:103', 'C:LEU:91', 'C:LEU:29', 'D:LEU:31', 'C:PHE:36', 'A:TYR:42', 'D:LEU:32', 'A:SER:35', 'D:VAL:137', 'C:PHE:128', 'C:PHE:98', 'A:SER:133', 'B:PRO:100', 'B:LEU:110', 'D:ALA:135', 'D:TYR:145', 'C:VAL:93', 'A:VAL:93', 'C:PRO:95', 'A:LYS:127', 'C:ARG:141', 'A:LEU:29', 'D:VAL:109', 'B:PHE:103', 'D:HIS:143', 'A:PHE:128', 'A:VAL:96', 'A:ARG:92', 'B:TYR:35', 'A:ALA:123', 'C:PHE:33', 'A:TYR:140', 'A:ASP:94', 'D:LEU:141', 'A:LEU:129', 'C:ALA:28', 'C:LYS:40', 'D:TRP:37', 'D:GLN:131', 'B:LYS:132', 'D:ALA:142', 'A:ARG:141', 'B:ALA:140', 'D:VAL:111', 'A:ALA:88', 'D:LEU:105', 'B:LEU:32', 'D:VAL:134', 'B:VAL:109', 'D:ASP:99', 'A:LYS:99', 'A:HIS:103', 'B:GLY:136', 'C:THR:137', 'B:GLU:101', 'C:VAL:1', 'D:VAL:34', 'C:SER:102', 'A:LYS:139', 'C:SER:131', 'B:ASP:99', 'C:PRO:37', 'C:LYS:99', 'A:LEU:100', 'C:A

## Subgraphing based on Residue Types


In [10]:
from graphein.protein.utils import extract_subgraph_from_residue_types

# Three-letter residue names to select.
residue_types = ["SER", "ALA", "GLY"]

s_g = extract_subgraph_from_residue_types(g, residue_types)
plotly_protein_structure_graph(
    s_g,
    colour_nodes_by="residue_name",
)

DEBUG:graphein.protein.utils:Found 144 nodes in the residue type subgraph.
DEBUG:graphein.protein.utils:Creating subgraph from nodes: ['C:ALA:123', 'B:GLY:25', 'A:ALA:120', 'A:SER:35', 'D:ALA:135', 'A:SER:133', 'D:ALA:76', 'D:GLY:119', 'D:GLY:83', 'D:GLY:64', 'C:GLY:25', 'A:ALA:79', 'B:SER:72', 'A:ALA:82', 'A:ALA:123', 'D:GLY:46', 'D:ALA:62', 'B:GLY:24', 'C:ALA:28', 'D:SER:49', 'D:ALA:129', 'D:ALA:142', 'B:ALA:140', 'A:ALA:5', 'C:ALA:26', 'A:ALA:88', 'A:SER:52', 'B:ALA:27', 'C:ALA:19', 'A:ALA:115', 'B:GLY:74', 'D:GLY:25', 'B:GLY:136', 'A:ALA:63', 'B:GLY:46', 'C:GLY:18', 'B:GLY:119', 'C:SER:102', 'D:SER:44', 'C:SER:131', 'D:SER:9', 'C:ALA:63', 'C:ALA:5', 'A:GLY:57', 'C:ALA:88', 'B:ALA:115', 'C:SER:81', 'D:ALA:128', 'A:ALA:21', 'A:ALA:19', 'B:ALA:142', 'A:ALA:111', 'D:ALA:138', 'A:ALA:53', 'B:SER:9', 'B:SER:49', 'B:GLY:83', 'C:SER:124', 'B:SER:44', 'B:ALA:129', 'C:ALA:82', 'C:ALA:110', 'A:GLY:25', 'D:SER:72', 'D:GLY:56', 'A:SER:3', 'A:GLY:59', 'B:ALA:76', 'C:ALA:53', 'B:ALA:53', 'B:GLY:5

In [11]:
# Invert the selection by passing inverse=True.
s_g = extract_subgraph_from_residue_types(
    g,
    residue_types,
    inverse=True,
)
plotly_protein_structure_graph(
    s_g,
    colour_nodes_by="residue_name",
    node_size_min=4,
    node_size_multiplier=2,
)

DEBUG:graphein.protein.utils:Found 144 nodes in the residue type subgraph.
DEBUG:graphein.protein.utils:Creating subgraph from nodes: ['C:ALA:123', 'B:GLY:25', 'A:ALA:120', 'A:SER:35', 'D:ALA:135', 'A:SER:133', 'D:ALA:76', 'D:GLY:119', 'D:GLY:83', 'D:GLY:64', 'C:GLY:25', 'A:ALA:79', 'B:SER:72', 'A:ALA:82', 'A:ALA:123', 'D:GLY:46', 'D:ALA:62', 'B:GLY:24', 'C:ALA:28', 'D:SER:49', 'D:ALA:129', 'D:ALA:142', 'B:ALA:140', 'A:ALA:5', 'C:ALA:26', 'A:ALA:88', 'A:SER:52', 'B:ALA:27', 'C:ALA:19', 'A:ALA:115', 'B:GLY:74', 'D:GLY:25', 'B:GLY:136', 'A:ALA:63', 'B:GLY:46', 'C:GLY:18', 'B:GLY:119', 'C:SER:102', 'D:SER:44', 'C:SER:131', 'D:SER:9', 'C:ALA:63', 'C:ALA:5', 'A:GLY:57', 'C:ALA:88', 'B:ALA:115', 'C:SER:81', 'D:ALA:128', 'A:ALA:21', 'A:ALA:19', 'B:ALA:142', 'A:ALA:111', 'D:ALA:138', 'A:ALA:53', 'B:SER:9', 'B:SER:49', 'B:GLY:83', 'C:SER:124', 'B:SER:44', 'B:ALA:129', 'C:ALA:82', 'C:ALA:110', 'A:GLY:25', 'D:SER:72', 'D:GLY:56', 'A:SER:3', 'A:GLY:59', 'B:ALA:76', 'C:ALA:53', 'B:ALA:53', 'B:GLY:5

## Subgraphing based on Chains
We can extract graphs of individual chains in a complexed structure graph.

First, let's recap what the original protein looks like when coloured by chain:

In [12]:
# Full graph, with nodes coloured by their chain.
plotly_protein_structure_graph(
    g,
    colour_nodes_by="chain_id",
    node_size_min=20,
    node_size_multiplier=1,
)

And now we extract the subgraph:

In [13]:
from graphein.protein.utils import extract_subgraph_from_chains

# Select chains A and B.
s_g = extract_subgraph_from_chains(
    g,
    ["A", "B"],
)
plotly_protein_structure_graph(
    s_g,
    colour_nodes_by="chain_id",
    node_size_min=20,
    node_size_multiplier=1,
)

DEBUG:graphein.protein.utils:Found 287 nodes in the chain subgraph.
DEBUG:graphein.protein.utils:Creating subgraph from nodes: ['B:LYS:95', 'B:LEU:81', 'A:LEU:136', 'A:PHE:46', 'A:PHE:117', 'B:VAL:23', 'A:LEU:80', 'B:GLY:25', 'B:ASN:57', 'B:VAL:111', 'B:ARG:30', 'A:PRO:44', 'A:ALA:120', 'B:VAL:20', 'B:PHE:118', 'A:TYR:42', 'B:ASP:47', 'A:SER:35', 'B:LEU:78', 'A:THR:67', 'B:THR:84', 'B:ASP:79', 'A:SER:133', 'B:LYS:17', 'A:LEU:66', 'B:PRO:100', 'B:LEU:110', 'A:PRO:4', 'B:ASN:80', 'B:ALA:128', 'A:HIS:122', 'A:VAL:93', 'A:LYS:127', 'A:LEU:29', 'A:ALA:79', 'B:PRO:51', 'B:PHE:103', 'A:PHE:128', 'A:VAL:96', 'A:ARG:92', 'B:LEU:14', 'A:PHE:43', 'B:SER:72', 'B:HIS:63', 'A:HIS:50', 'A:ALA:82', 'A:ALA:123', 'B:HIS:2', 'A:TYR:140', 'B:TYR:35', 'B:HIS:77', 'B:MET:55', 'B:LEU:3', 'B:GLY:136', 'A:ASP:94', 'A:VAL:62', 'A:LEU:129', 'B:GLY:24', 'B:LEU:28', 'B:GLU:43', 'B:GLU:6', 'B:LYS:59', 'A:THR:8', 'A:HIS:45', 'B:HIS:92', 'B:LYS:132', 'A:LEU:113', 'A:ARG:141', 'B:ALA:140', 'A:ALA:5', 'B:VAL:1', 'A:ALA

## Subgraphing based on sequence positions
We extract subgraphs based on their position in the sequence with `extract_subgraph_by_sequence_position()`:

*N.B. this does not discriminate based on chain. If you wish to do so, either use the base node_list subsetting function or compose the chain selection and the sequence position selection functions*

In [6]:
from graphein.protein.utils import extract_subgraph_by_sequence_position

# Odd sequence positions from 1 to 99.
SEQUENCE_POSITIONS = range(1, 100, 2)

s_g = extract_subgraph_by_sequence_position(
    g,
    SEQUENCE_POSITIONS,
)
plotly_protein_structure_graph(
    s_g,
    colour_nodes_by="residue_number",
    node_size_min=20,
    node_size_multiplier=1,
)

DEBUG:graphein.protein.utils:Found 200 nodes in the sequence position subgraph.
DEBUG:graphein.protein.utils:Creating subgraph from nodes: ['A:THR:67', 'D:LYS:95', 'D:ASP:99', 'A:PRO:37', 'A:GLY:59', 'C:LYS:11', 'C:ASN:97', 'C:THR:39', 'B:ASN:57', 'A:ASN:9', 'A:SER:3', 'B:MET:55', 'C:PRO:37', 'D:GLU:43', 'C:ASN:9', 'D:VAL:1', 'C:HIS:87', 'C:PRO:77', 'D:SER:9', 'C:ASP:75', 'A:VAL:93', 'C:ASP:85', 'A:ALA:71', 'A:HIS:45', 'B:LYS:17', 'B:TRP:37', 'D:HIS:97', 'C:LYS:7', 'B:HIS:63', 'C:PRO:95', 'C:ALA:53', 'C:ALA:13', 'B:LEU:31', 'A:GLY:25', 'B:GLY:29', 'D:ALA:13', 'C:GLY:51', 'C:SER:81', 'A:HIS:87', 'C:THR:41', 'B:LYS:95', 'A:SER:49', 'A:ASP:75', 'B:ALA:13', 'A:ALA:13', 'D:ASP:73', 'A:LYS:7', 'B:GLU:7', 'C:SER:49', 'B:SER:9', 'D:LYS:17', 'C:THR:67', 'D:ALA:53', 'D:VAL:11', 'D:PHE:45', 'C:GLY:15', 'B:VAL:23', 'B:ASP:21', 'B:PHE:71', 'A:LYS:99', 'A:ASN:97', 'A:GLY:15', 'B:PRO:5', 'A:LEU:29', 'D:ASP:79', 'A:LYS:11', 'D:GLY:25', 'D:GLY:29', 'B:SER:89', 'C:GLU:27', 'A:THR:39', 'C:ALA:79', 'D:LEU

## Subgraphs based on bond types
We can subset graphs to nodes that share certain bond types using `extract_subgraph_by_bond_type()`

In [15]:
from graphein.protein.utils import extract_subgraph_by_bond_type

# Bond types to select on.
BOND_TYPES = ["hbond", "ionic"]

s_g = extract_subgraph_by_bond_type(
    g,
    BOND_TYPES,
)
plotly_protein_structure_graph(
    s_g,
    node_size_min=10,
    node_size_multiplier=1,
)

DEBUG:graphein.protein.utils:Found 121 nodes in the bond type subgraph.
DEBUG:graphein.protein.utils:Creating subgraph from nodes: ['B:ARG:30', 'A:TYR:42', 'B:ASP:47', 'C:HIS:50', 'C:HIS:20', 'A:SER:133', 'B:LYS:17', 'A:HIS:122', 'D:ASP:47', 'A:LYS:127', 'C:ARG:141', 'D:HIS:2', 'A:ARG:92', 'C:LYS:11', 'D:ARG:30', 'A:HIS:50', 'B:TYR:35', 'B:HIS:2', 'B:HIS:77', 'C:LYS:7', 'C:GLU:30', 'D:GLU:90', 'B:GLU:43', 'D:SER:49', 'D:HIS:116', 'B:LYS:132', 'A:ARG:141', 'C:HIS:89', 'D:LYS:65', 'D:ASP:99', 'C:HIS:122', 'B:GLU:101', 'A:LYS:139', 'B:ASP:99', 'D:GLU:121', 'C:SER:81', 'B:HIS:116', 'B:GLU:26', 'A:ASP:74', 'B:SER:49', 'B:LYS:144', 'C:SER:124', 'B:GLU:22', 'C:ARG:31', 'D:HIS:117', 'D:HIS:77', 'D:ARG:104', 'B:ASP:94', 'A:ASP:64', 'C:ARG:92', 'A:SER:3', 'C:ASP:6', 'D:GLU:101', 'D:HIS:97', 'D:ASP:73', 'C:GLU:116', 'A:ASN:78', 'A:LYS:16', 'C:SER:49', 'A:GLU:27', 'C:LYS:139', 'D:LYS:144', 'B:LYS:65', 'A:ASP:126', 'C:ASP:75', 'A:GLU:116', 'A:LYS:7', 'A:LYS:60', 'A:HIS:20', 'D:GLU:22', 'A:ASP:6', '

## K-hop subgraphs
We can extract subgraphs based on the set of nodes that are within $k$ hops of a central node using `extract_k_hop_subgraph`:

In [16]:
from graphein.protein.utils import extract_k_hop_subgraph

# k = 1, centred on node A:ALA:110.
s_g = extract_k_hop_subgraph(
    g,
    central_node="A:ALA:110",
    k=1,
)
plotly_protein_structure_graph(
    s_g,
    node_size_min=10,
    node_size_multiplier=1,
)

DEBUG:graphein.protein.utils:Found 5 nodes in the k-hop subgraph.
DEBUG:graphein.protein.utils:Creating subgraph from nodes: ['A:ALA:111', 'A:PHE:117', 'B:ALA:115', 'A:ALA:110', 'A:LEU:109'].


In [17]:
# k = 2, centred on node A:ALA:110.
s_g = extract_k_hop_subgraph(
    g,
    central_node="A:ALA:110",
    k=2,
)
plotly_protein_structure_graph(
    s_g,
    node_size_min=10,
    node_size_multiplier=1,
)

DEBUG:graphein.protein.utils:Found 21 nodes in the k-hop subgraph.
DEBUG:graphein.protein.utils:Creating subgraph from nodes: ['A:THR:108', 'A:PHE:117', 'A:VAL:17', 'B:PHE:122', 'A:TYR:24', 'B:ALA:115', 'A:TRP:14', 'A:HIS:122', 'A:LEU:106', 'B:LEU:114', 'B:HIS:116', 'A:GLU:116', 'A:ALA:111', 'A:LEU:125', 'A:HIS:112', 'A:VAL:121', 'A:THR:118', 'A:VAL:107', 'A:ALA:110', 'A:LEU:113', 'A:LEU:109'].


In [18]:
# K = 3
# (This label is a comment, as in the neighbouring k-hop cells; the hop count
# actually used is the `k` argument below.)
s_g = extract_k_hop_subgraph(g, central_node="A:ALA:110", k=3)
plotly_protein_structure_graph(s_g, node_size_min=10, node_size_multiplier=1)

DEBUG:graphein.protein.utils:Found 57 nodes in the k-hop subgraph.
DEBUG:graphein.protein.utils:Creating subgraph from nodes: ['B:THR:123', 'A:THR:108', 'B:TYR:130', 'A:ALA:115', 'A:PHE:117', 'A:VAL:17', 'B:VAL:23', 'B:VAL:111', 'A:LYS:16', 'B:VAL:126', 'A:LEU:105', 'A:ALA:120', 'A:ALA:63', 'B:PHE:118', 'A:PRO:114', 'A:GLU:27', 'A:ALA:13', 'A:SER:124', 'B:PHE:122', 'B:LEU:110', 'A:LEU:66', 'A:TYR:24', 'B:ALA:115', 'A:TRP:14', 'A:HIS:122', 'A:LEU:106', 'A:ASP:126', 'A:ALA:21', 'B:LEU:114', 'B:HIS:116', 'A:GLU:116', 'A:VAL:10', 'B:VAL:113', 'B:GLU:26', 'A:GLY:15', 'A:ALA:111', 'A:LEU:125', 'A:HIS:20', 'A:HIS:112', 'A:GLU:23', 'A:VAL:121', 'A:THR:118', 'A:VAL:107', 'B:LEU:14', 'A:VAL:70', 'B:GLU:121', 'A:ALA:123', 'B:VAL:18', 'A:GLY:25', 'A:LEU:129', 'A:GLY:18', 'B:HIS:117', 'A:ALA:110', 'A:PRO:119', 'A:LEU:113', 'A:LEU:109', 'B:TRP:15'].


In [19]:
# k = 4, centred on node A:ALA:110.
s_g = extract_k_hop_subgraph(
    g,
    central_node="A:ALA:110",
    k=4,
)
plotly_protein_structure_graph(
    s_g,
    node_size_min=10,
    node_size_multiplier=1,
)

DEBUG:graphein.protein.utils:Found 109 nodes in the k-hop subgraph.
DEBUG:graphein.protein.utils:Creating subgraph from nodes: ['A:PHE:117', 'B:VAL:23', 'B:VAL:111', 'B:GLY:25', 'B:ARG:30', 'B:VAL:20', 'A:ALA:120', 'B:PHE:118', 'A:THR:67', 'B:LYS:17', 'B:LEU:110', 'A:LEU:66', 'A:HIS:122', 'A:LYS:127', 'C:ARG:141', 'B:PRO:51', 'A:PHE:128', 'B:LEU:14', 'B:TYR:35', 'A:ALA:123', 'B:MET:55', 'B:GLY:24', 'A:LEU:129', 'A:VAL:62', 'A:LEU:113', 'B:ALA:27', 'B:GLN:127', 'A:ALA:115', 'B:VAL:109', 'A:ALA:63', 'A:PRO:114', 'B:PHE:71', 'B:LEU:68', 'B:GLY:119', 'B:ALA:115', 'A:ALA:21', 'B:HIS:116', 'B:GLU:26', 'A:ALA:19', 'A:ALA:111', 'A:LEU:125', 'B:GLU:22', 'A:VAL:107', 'B:ALA:129', 'B:PRO:124', 'B:VAL:18', 'A:GLY:25', 'A:ASP:64', 'B:VAL:11', 'B:THR:123', 'A:VAL:17', 'A:LYS:16', 'A:ALA:12', 'A:GLU:27', 'B:PHE:122', 'A:ASP:126', 'B:PRO:125', 'A:VAL:10', 'B:LEU:114', 'A:GLU:116', 'B:VAL:113', 'B:GLN:131', 'B:LYS:120', 'A:GLY:15', 'A:ALA:28', 'A:HIS:20', 'A:HIS:112', 'A:ASP:6', 'A:ALA:130', 'A:GLU:23'

In [20]:
# Again, the selection can be inverted with inverse=True.
s_g = extract_k_hop_subgraph(
    g,
    central_node="A:ALA:110",
    k=4,
    inverse=True,
)
plotly_protein_structure_graph(
    s_g,
    node_size_min=10,
    node_size_multiplier=1,
)

DEBUG:graphein.protein.utils:Found 109 nodes in the k-hop subgraph.
DEBUG:graphein.protein.utils:Creating subgraph from nodes: ['A:PHE:117', 'B:VAL:23', 'B:VAL:111', 'B:GLY:25', 'B:ARG:30', 'B:VAL:20', 'A:ALA:120', 'B:PHE:118', 'A:THR:67', 'B:LYS:17', 'B:LEU:110', 'A:LEU:66', 'A:HIS:122', 'A:LYS:127', 'C:ARG:141', 'B:PRO:51', 'A:PHE:128', 'B:LEU:14', 'B:TYR:35', 'A:ALA:123', 'B:MET:55', 'B:GLY:24', 'A:LEU:129', 'A:VAL:62', 'A:LEU:113', 'B:ALA:27', 'B:GLN:127', 'A:ALA:115', 'B:VAL:109', 'A:ALA:63', 'A:PRO:114', 'B:PHE:71', 'B:LEU:68', 'B:GLY:119', 'B:ALA:115', 'A:ALA:21', 'B:HIS:116', 'B:GLU:26', 'A:ALA:19', 'A:ALA:111', 'A:LEU:125', 'B:GLU:22', 'A:VAL:107', 'B:ALA:129', 'B:PRO:124', 'B:VAL:18', 'A:GLY:25', 'A:ASP:64', 'B:VAL:11', 'B:THR:123', 'A:VAL:17', 'A:LYS:16', 'A:ALA:12', 'A:GLU:27', 'B:PHE:122', 'A:ASP:126', 'B:PRO:125', 'A:VAL:10', 'B:LEU:114', 'A:GLU:116', 'B:VAL:113', 'B:GLN:131', 'B:LYS:120', 'A:GLY:15', 'A:ALA:28', 'A:HIS:20', 'A:HIS:112', 'A:ASP:6', 'A:ALA:130', 'A:GLU:23'

## Subgraphing based on Atom Types
This can be achieved with `extract_subgraph_from_atom_types()`. This is not relevant for residue-level graphs, as we typically use C$\alpha$ atoms as the nodes. Instead, we create an atom-level graph for this example.

In [21]:
from graphein.protein.edges.atomic import add_atomic_edges

# NOTE: this rebinds both `config` and `g`; from here on `g` is the atom-level
# graph rather than the graph built at the top of the notebook.
config = ProteinGraphConfig(
    granularity="atom",
    edge_construction_functions=[add_atomic_edges],
)
g = construct_graph(config=config, pdb_code="4hhb")
plotly_protein_structure_graph(
    g,
    node_size_min=5,
    node_size_multiplier=1,
    colour_nodes_by="atom_type",
)

DEBUG:graphein.protein.graphs:Deprotonating protein. This removes H atoms from the pdb_df dataframe
DEBUG:graphein.protein.graphs:Detected 4384 total nodes


In [22]:
from graphein.protein.utils import extract_subgraph_from_atom_types

# Atom types to select.
ATOM_TYPES = ["CA", "N"]

s_g = extract_subgraph_from_atom_types(
    g,
    ATOM_TYPES,
)
plotly_protein_structure_graph(
    s_g,
    colour_nodes_by="atom_type",
    node_size_min=5,
    node_size_multiplier=1,
)

DEBUG:graphein.protein.utils:Found 1148 nodes in the atom type subgraph.
DEBUG:graphein.protein.utils:Creating subgraph from nodes: ['D:LEU:88:CA', 'C:VAL:93:N', 'C:ALA:130:N', 'C:ALA:111:CA', 'C:HIS:87:N', 'B:ALA:86:N', 'D:ALA:140:N', 'B:ALA:10:N', 'D:ASN:102:N', 'A:PRO:4:CA', 'A:ASN:68:CA', 'C:LYS:11:N', 'B:VAL:137:N', 'C:VAL:121:N', 'A:ASN:97:CA', 'A:SER:49:CA', 'C:THR:8:CA', 'D:HIS:92:N', 'B:TYR:130:N', 'C:ALA:63:CA', 'A:LEU:83:N', 'A:ASP:47:N', 'C:LEU:125:N', 'C:LEU:129:N', 'D:LEU:75:CA', 'A:THR:41:CA', 'D:SER:89:N', 'B:SER:72:N', 'C:PRO:4:N', 'A:LEU:34:N', 'C:THR:38:N', 'D:THR:12:N', 'A:LEU:125:CA', 'B:VAL:113:CA', 'B:VAL:1:N', 'C:SER:3:N', 'D:GLY:24:CA', 'D:GLY:29:CA', 'D:SER:9:CA', 'D:PRO:58:CA', 'C:HIS:20:N', 'C:LYS:16:N', 'C:SER:138:N', 'B:LEU:88:N', 'D:LEU:106:CA', 'B:ALA:27:N', 'A:ASP:47:CA', 'B:CYS:93:CA', 'A:HIS:20:CA', 'C:VAL:1:N', 'C:ASP:94:CA', 'B:HIS:2:CA', 'B:LYS:8:N', 'B:PHE:71:N', 'A:LEU:91:N', 'B:PRO:100:CA', 'C:LEU:66:N', 'A:THR:67:CA', 'B:VAL:11:N', 'D:SER:9:N',

## High-level function
We also provide a higher level function to combine multiple selections which wraps all of the aforementioned functions. All of the selections described previously can be performed with the `extract_subgraph` function:

```python
extract_subgraph(
    g: nx.Graph,
    node_list: Optional[List[str]] = None,
    sequence_positions: Optional[List[str]] = None,
    chains: Optional[List[str]] = None,
    residue_types: Optional[List[str]] = None,
    atom_types: Optional[List[str]] = None,
    bond_types: Optional[List[str]] = None,
    centre_point: Optional[
        Union[np.ndarray, Tuple[float, float, float]]
    ] = None,
    radius: Optional[float] = None,
    k_hop_central_node: Optional[str] = None,
    k_hops: Optional[int] = None,
    k_only: Optional[bool] = None,
    filter_dataframe: bool = True,
    inverse: bool = False,
    return_node_list: bool = False,
) -> Union[nx.Graph, List[str]]:
```


In [23]:
from graphein.protein.utils import extract_subgraph

# NODE_LIST holds residue-level node ids (CHAIN:RESIDUE_NAME:POSITION), but `g`
# was rebound to the atom-level graph in the atom-type section, whose node ids
# carry an extra atom suffix (e.g. 'D:LEU:88:CA'). Rebuild the residue-level
# graph so that the node-list selections below match nodes in the graph.
residue_g = construct_graph(
    config=ProteinGraphConfig(edge_construction_functions=edge_fns),
    pdb_code="4hhb",
)

# Node list selection
s_g = extract_subgraph(residue_g, node_list=NODE_LIST, inverse=False)

# Sequence position selection
s_g = extract_subgraph(residue_g, sequence_positions=SEQUENCE_POSITIONS, inverse=False)

# Chain selection
s_g = extract_subgraph(residue_g, chains=["A", "B"], inverse=False)

# Performing selections with multiple methods
s_g = extract_subgraph(residue_g, node_list=NODE_LIST, chains=["A"], inverse=False)

# Only the last selection (node list combined with chain A) is plotted.
plotly_protein_structure_graph(s_g, node_size_min=10, node_size_multiplier=1)


DEBUG:graphein.protein.utils:Creating subgraph from nodes: ['B:LYS:95', 'C:LYS:56', 'B:VAL:111', 'C:LEU:29', 'C:PHE:36', 'B:PHE:118', 'C:HIS:50', 'B:THR:84', 'C:PRO:4', 'C:HIS:20', 'B:PRO:100', 'B:LEU:110', 'C:PHE:46', 'C:VAL:10', 'C:GLY:25', 'B:PHE:103', 'C:LYS:11', 'C:VAL:70', 'C:PHE:33', 'C:LYS:7', 'C:GLU:30', 'C:ALA:28', 'C:LYS:40', 'B:HIS:92', 'B:LYS:132', 'B:ALA:140', 'C:LEU:2', 'C:ALA:26', 'C:ALA:19', 'B:GLN:127', 'B:VAL:109', 'B:GLY:136', 'C:GLY:18', 'C:VAL:55', 'B:GLU:101', 'C:VAL:1', 'B:GLY:119', 'B:ASP:99', 'C:PRO:37', 'C:ALA:63', 'C:ALA:5', 'C:LEU:48', 'B:ALA:115', 'B:VAL:137', 'B:HIS:116', 'B:ALA:142', 'C:LEU:66', 'B:GLY:83', 'B:LYS:144', 'C:ARG:31', 'B:PRO:124', 'B:ALA:129', 'C:LEU:34', 'C:HIS:58', 'C:MET:32', 'B:TYR:145', 'B:ASP:94', 'C:ASP:6', 'C:ALA:53', 'B:THR:123', 'B:LEU:105', 'C:ALA:69', 'C:VAL:62', 'C:SER:49', 'C:ALA:12', 'C:HIS:45', 'C:GLN:54', 'C:ALA:65', 'B:PHE:122', 'B:GLY:107', 'C:GLY:57', 'B:LEU:91', 'B:LYS:82', 'C:THR:8', 'B:PRO:125', 'B:LEU:114', 'B:VAL:11