# Imports

The **services module** contains functions for accessing third party bioemdical data services.

In [None]:
from services.pdb import file, molecule, ligand, pfam, go_terms
from services.uniprot import pdb2uniprot

##### PDB
For PDB we have functions for quering information about (1) PDB files, (2) Protein Molecules, (3) Ligands, (4) PFam Annotations and (5) GO annotations.  

##### Uniprot
For Uniprot we query a service that maps pdb IDs to uniprot IDs. 

# Pocket Feature KG Set Up

In [6]:
import pandas as pd
import networkx as nx
import numpy as np

##### Step 1. Convert pocket feature scores to "cosine similarities"

Given a matrix $X \in \mathbb R ^n$ with rows $x_i$, we compute the cosine similarity between two rows as:

$$ \frac{x_i^T x_j}{\lVert x_i \rVert \lVert x_j \rVert }$$

Given symmetric positive semidefinite matrix $A = X X^T$, we can compute cosine similarities using hte formula:

$$\frac{a_{ij}}{\sqrt{a_{ii}}\sqrt{a_{jj}}} = \frac{x_i^T x_j}{\lVert x_i \rVert \lVert x_j \rVert }$$ 

In [None]:
# Read in pocket feature similarities and convert to an adjacency matrix
pf = pd.read_csv('./data/pocket_feature_scores.csv', header=None, names=['pocket_0', 'pocket_1', 'weight'])
PFG = nx.from_pandas_edgelist(pf, source='pocket_0', target='pocket_1', edge_attr='weight')
pf_matrix = nx.to_pandas_adjacency(PFG, dtype=np.float64)

# Compute cosine similarities from pocket feature score matrix
diagonal = np.sqrt(np.diag(-1*pf_matrix))
denominator = np.outer(diagonal, diagonal)
normalized = (-1*pf_matrix)/denominator

##### Step 2. Filter cosine similarities

In order to reduce memory and computational costs, we filter out cosine slimilarity values that fall outside of a range of interest.  For example, we may wish to exclude low similarity values.  Likewise, we do not wish to have self referencing edges in our graph, so we also filter out diagonal values in the adjacency matrix.

In [23]:
# Specify desired range
score_range = [0.7, 1.0]
lower_limit = score_range[0]
upper_limit = score_range[1]

# Set matrix entries out of specificed range to NaN
if lower_limit > 0:
    normalized[normalized < lower_limit] = np.nan
if upper_limit < 1:
    normalized[normalized > upper_limit] = np.nan

# Set off Diagonal to NaN
np.fill_diagonal(normalized.values, np.nan)

(3384, 3)


##### Step 3. Initialize knowledge graph using similarities

We use the flitered adjacency matrix to initialize a graph where nodes are co-crystal structures and edges are pocket feature "cosine" similarities.

In [None]:
normalized_edges = normalized.stack().reset_index()
normalized_edges = normalized_edges.rename(columns={0:'weight'})
PFG = nx.from_pandas_edgelist(normalized_edges, source='level_0', target='level_1', edge_attr='weight')

##### Step 4. Initialize Ligand Nodes

We will need a figure showing data model used for our graph

In [24]:
# Generate list of tuples (PDB_ID, Ligand_ID)
structures_to_ligands = [i.split('_') for i in list(PFG.nodes)] 

# Unzip tuples into two separate lists
labels, ligands = zip( *structures_to_ligands )

# Create a dictionary for updating pdb sretucture node IDs
fixed_names = {k: v for k, v in zip(PFG.nodes, labels)}
PFG = nx.relabel_nodes(PFG, fixed_names)

# Use list of tuples as an edge list to update the graph (i.e. add ligand nodes)
PFG.add_edges_from(structures_to_ligands, weight=0)

# Compute Scaffold Hop Scores

We define a scaffold hop score for a pair of ligands as:

$$max\left(\frac{F(m_1, m_2)}{T(m_1,m_2)}\right)$$

Where $m_i$ denotes a ligand in our dataset, $F$ is the pocket feature cosine similarity between bound protein structures, and $T$ is the tanomoto coefficient between chemical fingerprints.  Ligands may be bound in multiple co-crystal structures, and for any pair of ligands there may be a range of scaffold hop scores.  We take the maximum value.  In general, the high scaffold hop scores corrspond to ligands with dissimilar strucutres, that bind slimilar protein pockets.


Protocol Steps
1. Iterate through unique ligand pairs
2. Find shortest path between ligands (using cosine distance)
3. Compute scaffold hop score
4. Filter out cofactors, low scaffold hop scores

Finally sort, by scaffold hop score.

In [26]:
# Import ligand tanomoto matrix
ligand_matrix = pd.read_csv('./data/ligand_comparisons.csv', index_col=0)

In [27]:
import itertools

scaffold_hops = []

# Iterate through unique pairs of ligands
for i,j in itertools.combinations(set(ligands[:700]), 2):
    try:
        # Compute dijkstra's shortest path (max length 2.5)
        score, path = nx.single_source_dijkstra(PFG, 
                                                source=i, 
                                                target=j, 
                                                cutoff=2.5, 
                                                weight=lambda u , v, d: (1-d['weight']) 
                                               )
        
        # Filter cases where two ligands map to the same strcuture 
        if score == 2: continue
        
        # Compute scaffold hop score
        ligand_sim = ligand_matrix.loc[i, j]
        sh_score = (3 - score)/(ligand_sim)
        
        # Filer out scaffold hop scores < 1, which are not interesting
        if sh_score < 1: continue
            
        scaffold_hops.append([sh_score, path] )
        
    except nx.NetworkXNoPath:
        pass
    
# Sort paths by scaffold hop scores
sorted(scaffold_hops, key=lambda x: x[0], reverse=True)

[[3.7090795180592058, ['TOL', '1zua', '2pdb', '3s3g', 'TLT']],
 [3.5680618401206634, ['E3O', '2j7y', '1x7r', 'GEN']],
 [3.3935067142614326, ['TOL', '1zua', '4gca', '2X9']],
 [3.35275635275755, ['ASC', '1xid', '1xli', 'GLT']],
 [3.2920030757401, ['7AP', '2g6j', '4fvy', 'H4B']],
 [3.2362680667388033, ['WST', '2pog', '1x7r', 'GEN']],
 [3.097496877016808, ['0L8', '4dma', '2pog', 'WST']],
 [3.0883829430454544, ['WFG', '3wfg', '2aax', 'PDN']],
 [3.0880041114940453, ['TOL', '1zua', '2pdb', 'ZST']],
 [3.0805662491817554, ['SUZ', '4wev', '3s3g', 'TLT']],
 [3.032219694308812, ['RME', '3rme', '2ybu', 'CX9']],
 [3.019298199277709, ['TOL', '1zua', '4wev', 'SUZ']],
 [3.000240500240501, ['0S3', '4eo8', '2hai', 'PFI']],
 [2.9638358969824403, ['ZST', '2pdb', '3s3g', 'TLT']],
 [2.9557690672497885, ['P44', '2pvj', '4dgm', 'AGI']],
 [2.92887325820739, ['555', '2nv7', '2j7y', 'E3O']],
 [2.925113299937984, ['73B', '4bjx', '3aqa', 'BYH']],
 [2.8463757164142387, ['555', '2nv7', '3oll', 'EST']],
 [2.8306215993

# Results

Top scaffold hops.

Subset (a) exact same protein, (b) close homolog, (c) neither


Exact same.  Map to same uniprot accession or ID.
Close homolog. Different Uniprots, but same Pfam annotation, high GO Similarity, Species (paralog vs ortholog).

Show that exact same are ranked highly.  Then filter out.  Same thing with close homologs.

Interesting examples are (a) neither exact or close, (b) multi-hops.

Search for prior information about "interesting scaffold hops" to see if we get anything new.

In [41]:
# path = ['FID', '2agt', '2pdb', 'TLT']
path = ['FID', '2pdb', '3s3g', 'TLT']
n1, n2 = path[1], path[2]
m1, m2 = path[0], path[3]
PFG[n1][n2]

{'weight': 0.9365721434464511}

In [42]:
from pprint import pprint
pprint([pdb_mol(n1), pdb_file(n1)] )
pprint([pdb_mol(n2), pdb_file(n2)] )
pprint([pdb_ligand(m1), pdb_ligand(m2)])

[{'id': '2PDB',
  'polymerDescriptions': [{'chain': {'id': 'A'},
                           'entityNr': '1',
                           'enzClass': {'ec': '1.1.1.21'},
                           'length': '316',
                           'mutation': {'desc': 'L4I, F121P'},
                           'polymerDescription': {'description': 'Aldose '
                                                                 'reductase'}}]},
 [{'expMethod': 'X-RAY DIFFRACTION',
   'keywords': 'OXIDOREDUCTASE',
   'nr_atoms': '2607',
   'nr_entities': '3',
   'nr_residues': '316',
   'organism': 'Homo sapiens',
   'publish_date': '2007-03-31',
   'pubmedId': '18495158',
   'replaces': '',
   'revision_date': '2008-04-01',
   'status': 'CURRENT',
   'structureId': '2PDB',
   'title': 'Human aldose reductase mutant F121P complexed with zopolrestat.'}]]
[{'id': '3S3G',
  'polymerDescriptions': [{'chain': {'id': 'A'},
                           'entityNr': '1',
                           'enzClass': {'ec

In [43]:
pdb_uniprot(n1)

From	To
2pdb	P15121



In [44]:
pdb_uniprot(n2)

From	To
3s3g	P15121



Top scaffold hops.

