# Preamble
##### Library imports and helper functions used in scripts.  Mostly for API calls.

In [34]:
import pandas as pd
import networkx as nx
import numpy as np
import requests

def fetch(url):
    response = requests.get(url)
    if response.status_code != 200:
        return {}
    else: 
        return response.json()    
    
def pdb_file(pdb_id):
    request = f'https://www.rcsb.org/pdb/json/describePDB?structureId={pdb_id}'
    return fetch(request)

def pdb_mol(pdb_id):
    request = f'https://www.rcsb.org/pdb/json/describeMol?structureId={pdb_id}'
    return fetch(request)

def pdb_ligand(lig_id):
    request = f'https://www.rcsb.org/pdb/json/describeHet?chemicalID={lig_id}'
    return fetch(request)

def pdb_go_terms(pdb_id):
    request = f'https://www.rcsb.org/pdb/json/goTerms?structureId={pdb_id}'
    return fetch(request)

import urllib.parse
import urllib.request

def pdb_uniprot(pdb_id):
    url = 'https://www.uniprot.org/uploadlists/'

    params = {
    'from': 'PDB_ID',
    'to': 'ACC',
    'format': 'tab',
    'query': pdb_id
    }

    data = urllib.parse.urlencode(params)
    data = data.encode('utf-8')
    req = urllib.request.Request(url, data)
    with urllib.request.urlopen(req) as f:
       response = f.read()
    print(response.decode('utf-8'))
    return

In [22]:
from xml.etree import ElementTree
import pprint
pp = pprint.PrettyPrinter()

def pfam(pdb_id):
    url = f'https://www.rcsb.org/pdb/rest/hmmer?structureId={pdb_id}'
    return xml_response(url)

def xml_response(url):
    response = requests.get(url)
    tree = ElementTree.fromstring(response.content)
    return tree[0].attrib

pp.pprint(pfam('1xkk'))

{'chainId': 'A',
 'eValue': '0.0',
 'pdbResNumEnd': '965',
 'pdbResNumStart': '713',
 'pfamAcc': 'PF07714.16',
 'pfamDesc': 'Protein tyrosine kinase',
 'pfamName': 'Pkinase_Tyr',
 'structureId': '1XKK'}


# KG Set Up

## Normalize PF Similarties

##### Step 1. Convert from nxn "raw similarities" to nxn "cosine similarities"
##### Step 2. Filter out similarities we don't like
##### Step 3. Initialize knowledge graph using similarities

In [23]:
pf = pd.read_csv('./data/pocket_feature_scores.csv', header=None, names=['pocket_0', 'pocket_1', 'weight'])
PFG = nx.from_pandas_edgelist(pf, source='pocket_0', target='pocket_1', edge_attr='weight')

pf_matrix = nx.to_pandas_adjacency(PFG, dtype=np.float64)

diagonal = np.sqrt(np.diag(-1*pf_matrix))
denominator = np.outer(diagonal, diagonal)
normalized = (-1*pf_matrix)/denominator

normalized[normalized < 0.7] = np.nan
# normalized[normalized > 0.6] = np.nan


np.fill_diagonal(normalized.values, np.nan)
normalized_edges = normalized.stack().reset_index()
normalized_edges = normalized_edges.rename(columns={0:'weight'})
PFG = nx.from_pandas_edgelist(normalized_edges, source='level_0', target='level_1', edge_attr='weight')
# PFG['3qfz_NOJ']['4zlg_LGC']
print(normalized_edges.shape)

(3384, 3)


##### Initialize Ligand Nodes

We will need a figure showing data model used for our graph

In [24]:
structures_to_ligands = [i.split('_') for i in list(PFG.nodes)] 
labels, ligands = zip( *structures_to_ligands )
fixed_names = {k: v for k, v in zip(PFG.nodes, labels)}
PFG = nx.relabel_nodes(PFG, fixed_names)
# PFG['3qfz']['4zlg']

In [25]:
PFG.add_edges_from(structures_to_ligands, weight=0)

In [26]:
ligand_matrix = pd.read_csv('./data/ligand_comparisons.csv', index_col=0)

##### API calls For later

We use PDB and Uniprot APIs to grab data when we need to.

## Compute Scaffold Hop Scores

1. Iterate through unique ligand pairs
2. Find shortest path between ligands (using cosine distance)
3. Compute scaffold hop score
4. Filter out cofactors, low scaffold hop scores

Finally sort, by scaffold hop score.

In [27]:
import itertools

scaffold_hops = []
for i,j in itertools.combinations(set(ligands[:700]), 2):
    try:
        score, path = nx.single_source_dijkstra(PFG, 
                                                source=i, 
                                                target=j, 
                                                cutoff=2.5, 
                                                weight=lambda u , v, d: (1-d['weight']) 
                                               )
        if score == 2: continue
        ligand_sim = ligand_matrix.loc[i, j]
        sh_score = (3 - score)/(ligand_sim)
        
        if sh_score < 1: continue
            
        scaffold_hops.append([sh_score, path] )
        
    except nx.NetworkXNoPath:
        pass
sorted(scaffold_hops, key=lambda x: x[0], reverse=True)

[[3.7090795180592058, ['TOL', '1zua', '2pdb', '3s3g', 'TLT']],
 [3.5680618401206634, ['E3O', '2j7y', '1x7r', 'GEN']],
 [3.3935067142614326, ['TOL', '1zua', '4gca', '2X9']],
 [3.35275635275755, ['ASC', '1xid', '1xli', 'GLT']],
 [3.2920030757401, ['7AP', '2g6j', '4fvy', 'H4B']],
 [3.2362680667388033, ['WST', '2pog', '1x7r', 'GEN']],
 [3.097496877016808, ['0L8', '4dma', '2pog', 'WST']],
 [3.0883829430454544, ['WFG', '3wfg', '2aax', 'PDN']],
 [3.0880041114940453, ['TOL', '1zua', '2pdb', 'ZST']],
 [3.0805662491817554, ['SUZ', '4wev', '3s3g', 'TLT']],
 [3.032219694308812, ['RME', '3rme', '2ybu', 'CX9']],
 [3.019298199277709, ['TOL', '1zua', '4wev', 'SUZ']],
 [3.000240500240501, ['0S3', '4eo8', '2hai', 'PFI']],
 [2.9638358969824403, ['ZST', '2pdb', '3s3g', 'TLT']],
 [2.9557690672497885, ['P44', '2pvj', '4dgm', 'AGI']],
 [2.92887325820739, ['555', '2nv7', '2j7y', 'E3O']],
 [2.925113299937984, ['73B', '4bjx', '3aqa', 'BYH']],
 [2.8463757164142387, ['555', '2nv7', '3oll', 'EST']],
 [2.8306215993

# Results

Top scaffold hops.

Subset (a) exact same protein, (b) close homolog, (c) neither


Exact same.  Map to same uniprot accession or ID.
Close homolog. Different Uniprots, but same Pfam annotation, high GO Similarity, Species (paralog vs ortholog).

Show that exact same are ranked highly.  Then filter out.  Same thing with close homologs.

Interesting examples are (a) neither exact or close, (b) multi-hops.

Search for prior information about "interesting scaffold hops" to see if we get anything new.

In [41]:
# path = ['FID', '2agt', '2pdb', 'TLT']
path = ['FID', '2pdb', '3s3g', 'TLT']
n1, n2 = path[1], path[2]
m1, m2 = path[0], path[3]
PFG[n1][n2]

{'weight': 0.9365721434464511}

In [42]:
from pprint import pprint
pprint([pdb_mol(n1), pdb_file(n1)] )
pprint([pdb_mol(n2), pdb_file(n2)] )
pprint([pdb_ligand(m1), pdb_ligand(m2)])

[{'id': '2PDB',
  'polymerDescriptions': [{'chain': {'id': 'A'},
                           'entityNr': '1',
                           'enzClass': {'ec': '1.1.1.21'},
                           'length': '316',
                           'mutation': {'desc': 'L4I, F121P'},
                           'polymerDescription': {'description': 'Aldose '
                                                                 'reductase'}}]},
 [{'expMethod': 'X-RAY DIFFRACTION',
   'keywords': 'OXIDOREDUCTASE',
   'nr_atoms': '2607',
   'nr_entities': '3',
   'nr_residues': '316',
   'organism': 'Homo sapiens',
   'publish_date': '2007-03-31',
   'pubmedId': '18495158',
   'replaces': '',
   'revision_date': '2008-04-01',
   'status': 'CURRENT',
   'structureId': '2PDB',
   'title': 'Human aldose reductase mutant F121P complexed with zopolrestat.'}]]
[{'id': '3S3G',
  'polymerDescriptions': [{'chain': {'id': 'A'},
                           'entityNr': '1',
                           'enzClass': {'ec

In [43]:
pdb_uniprot(n1)

From	To
2pdb	P15121



In [44]:
pdb_uniprot(n2)

From	To
3s3g	P15121



Top scaffold hops.

