# Preamble
##### Library imports and helper functions used in scripts.  Mostly for API calls.

In [17]:
import pandas as pd
import networkx as nx
import numpy as np
import requests

def fetch(url):
    response = requests.get(url)
    if response.status_code != 200:
        return {}
    else: 
        return response.json()    

def pdb_file(pdb_id):
    request = f'https://www.rcsb.org/pdb/json/describePDB?structureId={pdb_id}'
    return fetch(request)

def pdb_mol(pdb_id):
    request = f'https://www.rcsb.org/pdb/json/describeMol?structureId={pdb_id}'
    return fetch(request)

def pdb_ligand(lig_id):
    request = f'https://www.rcsb.org/pdb/json/describeHet?chemicalID={lig_id}'
    return fetch(request)

def pdb_go_terms(pdb_id):
    request = f'https://www.rcsb.org/pdb/json/goTerms?structureId={pdb_id}'
    return fetch(request)

import urllib.parse
import urllib.request

def pdb_uniprot(pdb_id):
    url = 'https://www.uniprot.org/uploadlists/'

    params = {
    'from': 'PDB_ID',
    'to': 'ACC',
    'format': 'tab',
    'query': pdb_id
    }

    data = urllib.parse.urlencode(params)
    data = data.encode('utf-8')
    req = urllib.request.Request(url, data)
    with urllib.request.urlopen(req) as f:
       response = f.read()
    print(response.decode('utf-8'))
    return

# KG Set Up

## Normalize PF Similarties

##### Step 1. Convert from nxn "raw similarities" to nxn "cosine similarities"
##### Step 2. Filter out similarities we don't like
##### Step 3. Initialize knowledge graph using similarities

In [14]:
pf = pd.read_csv('./data/pocket_feature_scores.csv', header=None, names=['pocket_0', 'pocket_1', 'weight'])
PFG = nx.from_pandas_edgelist(pf, source='pocket_0', target='pocket_1', edge_attr='weight')

pf_matrix = nx.to_pandas_adjacency(PFG, dtype=np.float64)

diagonal = np.sqrt(np.diag(-1*pf_matrix))
denominator = np.outer(diagonal, diagonal)
normalized = (-1*pf_matrix)/denominator

normalized[normalized < 0.7] = np.nan
# normalized[normalized > 0.6] = np.nan


np.fill_diagonal(normalized.values, np.nan)
normalized_edges = normalized.stack().reset_index()
normalized_edges = normalized_edges.rename(columns={0:'weight'})
PFG = nx.from_pandas_edgelist(normalized_edges, source='level_0', target='level_1', edge_attr='weight')
# PFG['3qfz_NOJ']['4zlg_LGC']
print(normalized_edges.shape)

(3384, 3)


##### Initialize Ligand Nodes

We will need a figure showing data model used for our graph

In [15]:
structures_to_ligands = [i.split('_') for i in list(PFG.nodes)] 
labels, ligands = zip( *structures_to_ligands )
fixed_names = {k: v for k, v in zip(PFG.nodes, labels)}
PFG = nx.relabel_nodes(PFG, fixed_names)
# PFG['3qfz']['4zlg']

In [16]:
PFG.add_edges_from(structures_to_ligands, weight=0)

In [18]:
ligand_matrix = pd.read_csv('./data/ligand_comparisons.csv', index_col=0)

##### API calls For later

We use PDB and Uniprot APIs to grab data when we need to.

## Compute Scaffold Hop Scores

1. Iterate through unique ligand pairs
2. Find shortest path between ligands (using cosine distance)
3. Compute scaffold hop score
4. Filter out cofactors, low scaffold hop scores

Finally sort, by scaffold hop score.

In [19]:
import itertools

scaffold_hops = []
for i,j in itertools.combinations(set(ligands[:700]), 2):
    try:
        score, path = nx.single_source_dijkstra(PFG, 
                                                source=i, 
                                                target=j, 
                                                cutoff=2.5, 
                                                weight=lambda u , v, d: (1-d['weight']) 
                                               )
        if score == 2: continue
        ligand_sim = ligand_matrix.loc[i, j]
        sh_score = (3 - score)/(ligand_sim)
        
        if sh_score < 1: continue
            
        scaffold_hops.append([sh_score, path] )
        
    except nx.NetworkXNoPath:
        pass
sorted(scaffold_hops, key=lambda x: x[0], reverse=True)

[[3.7090795180592058, ['TOL', '1zua', '2pdb', '3s3g', 'TLT']],
 [3.5680618401206634, ['GEN', '1x7r', '2j7y', 'E3O']],
 [3.3935067142614326, ['TOL', '1zua', '4gca', '2X9']],
 [3.35275635275755, ['GLT', '1xli', '1xid', 'ASC']],
 [3.2920030757401, ['H4B', '4fvy', '2g6j', '7AP']],
 [3.2362680667388033, ['GEN', '1x7r', '2pog', 'WST']],
 [3.097496877016808, ['0L8', '4dma', '2pog', 'WST']],
 [3.0883829430454544, ['PDN', '2aax', '3wfg', 'WFG']],
 [3.0880041114940453, ['TOL', '1zua', '2pdb', 'ZST']],
 [3.0805662491817554, ['TLT', '3s3g', '4wev', 'SUZ']],
 [3.032219694308812, ['CX9', '2ybu', '3rme', 'RME']],
 [3.019298199277709, ['TOL', '1zua', '4wev', 'SUZ']],
 [3.000240500240501, ['0S3', '4eo8', '2hai', 'PFI']],
 [2.9638358969824403, ['ZST', '2pdb', '3s3g', 'TLT']],
 [2.9557690672497885, ['AGI', '4dgm', '2pvj', 'P44']],
 [2.92887325820739, ['E3O', '2j7y', '2nv7', '555']],
 [2.925113299937984, ['BYH', '3aqa', '4bjx', '73B']],
 [2.8463757164142387, ['EST', '3oll', '2nv7', '555']],
 [2.8306215993

# Results

Top scaffold hops.

Subset (a) exact same protein, (b) close homolog, (c) neither


Exact same.  Map to same uniprot accession or ID.
Close homolog.  Sequence alignmed, SCOP classes, EC Classes, Pfam annotation, Species.

Show that exact same are ranked highly.  Then filter out.  Same thing with close homologs.

Intersting examples are (a) neither, (b) multi-hop cases.

Search for prior information about "interesting scaffold hops" to see if we get anything new.

In [47]:
path = ['VD4', '2o4j', '4fhh', '0U3']
# path = ['TOL', '2pdb', '3s3g', 'TLT']
n1, n2 = path[1], path[2]
m1, m2 = path[0], path[3]
PFG[n1][n2]

{'weight': 0.7763767984837416}

In [48]:
from pprint import pprint
pprint([pdb_mol(n1), pdb_file(n1)] )
pprint([pdb_mol(n2), pdb_file(n2)] )
pprint(pdb_ligand(m1) + pdb_ligand(m2))

[{'id': '2O4J',
  'polymerDescriptions': [{'chain': {'id': 'A'},
                           'entityNr': '1',
                           'fragment': {'desc': 'ligand binding domain'},
                           'length': '292',
                           'mutation': {'desc': '&Delta;S165, &Delta;Y166, '
                                                '&Delta;S167, &Delta;P168, '
                                                '&Delta;R169, &Delta;P170, '
                                                '&Delta;T171, &Delta;L172, '
                                                '&Delta;S173, &Delta;F174, '
                                                '&Delta;S175, &Delta;G176, '
                                                '&Delta;N177, &Delta;S178, '
                                                '&Delta;S179, &Delta;S180, '
                                                '&Delta;S181, &Delta;S182, '
                                                '&Delta;S183, &Delta;D184, '
   

In [50]:
pdb_uniprot(n1)

From	To
2o4j	P13053
2o4j	Q15648



In [51]:
pdb_uniprot(n2)

From	To
4fhh	Q15596
4fhh	Q9PTN2



Top scaffold hops.

