In [1]:
import pandas as pd
import networkx as nx
import numpy as np
import requests

Notes: a query should take a range, i.e. get all edges between (x, y).

Looks like a similarity of around 0.9 gives the same protein.  0.7

In [2]:
# Pairs whose normalized similarity falls below this value are dropped.
MIN_SIMILARITY = 0.3

# Raw pairwise pocket scores: headerless CSV of (pocket_0, pocket_1, weight) rows.
pf = pd.read_csv('./pocket_feature_scores.csv', header=None, names=['pocket_0', 'pocket_1', 'weight'])
PFG = nx.from_pandas_edgelist(pf, source='pocket_0', target='pocket_1', edge_attr='weight')

pf_matrix = nx.to_pandas_adjacency(PFG, dtype=np.float64)

# Normalize each negated score by the square roots of the two pockets' negated
# self-scores (the matrix diagonal).
# NOTE(review): presumably raw scores are negative and every pocket has a
# self-comparison row in the CSV; a missing or positive self-score would give
# inf/NaN entries here -- confirm against the data.
diagonal = np.sqrt(np.diag(-1*pf_matrix))
denominator = np.outer(diagonal, diagonal)
normalized = (-1*pf_matrix)/denominator

# Blank out weak pairs and self-pairs with an explicit mask instead of writing
# through `normalized.values`, which is read-only under pandas Copy-on-Write.
keep = (normalized >= MIN_SIMILARITY).to_numpy() & ~np.eye(len(normalized), dtype=bool)
normalized = normalized.where(keep)

# Drop the blanked entries explicitly: newer pandas `stack` keeps NaN rows.
normalized_edges = normalized.stack().dropna().reset_index()
normalized_edges = normalized_edges.rename(columns={0:'weight'})
PFG = nx.from_pandas_edgelist(normalized_edges, source='level_0', target='level_1', edge_attr='weight')
# Example lookup: PFG['3qfz_NOJ']['4zlg_LGC']
print(normalized_edges.shape)

(91792, 3)


In [3]:
# Node names look like '<structure>_<ligand>' (e.g. '3qfz_NOJ'); split each one
# into its parts and rename every node to the part before the underscore.
structures_to_ligands = [node.split('_') for node in PFG.nodes]
labels, ligands = zip(*structures_to_ligands)
fixed_names = dict(zip(PFG.nodes, labels))
PFG = nx.relabel_nodes(PFG, fixed_names)
# Example lookup: PFG['3qfz']['4zlg']

In [4]:
# Attach ligands to the graph: every [structure, ligand] pair produced by the
# node-name split becomes an edge carrying weight 0.
# NOTE(review): assumes each original node name split into exactly two parts.
PFG.add_edges_from(structures_to_ligands, weight=0)

In [9]:
def fetch(url, timeout=30):
    """GET ``url`` and return its decoded JSON body.

    Parameters
    ----------
    url : str
        Address to request.
    timeout : float or tuple, optional
        Forwarded to ``requests.get``. Without a timeout a stalled server
        would block the kernel indefinitely; defaults to 30 seconds.

    Returns
    -------
    The parsed JSON payload when the response status is 200, otherwise an
    empty dict.

    Raises
    ------
    Exceptions raised by ``requests.get`` (connection errors, timeouts) and by
    ``response.json()`` (a 200 response whose body is not JSON) propagate.
    """
    response = requests.get(url, timeout=timeout)
    if response.status_code != 200:
        # Best-effort lookup: any non-200 answer is reported as "no data".
        return {}
    return response.json()

def pdb_file(pdb_id):
    """Return the ``describePDB`` JSON record for structure ``pdb_id``.

    The result is whatever ``fetch`` returns for the request URL.
    NOTE(review): this is the legacy rcsb.org/pdb/json API -- confirm the
    endpoint is still served.
    """
    return fetch(f'https://www.rcsb.org/pdb/json/describePDB?structureId={pdb_id}')

def pdb_mol(pdb_id):
    """Return the ``describeMol`` JSON record for structure ``pdb_id``.

    The result is whatever ``fetch`` returns for the request URL.
    NOTE(review): this is the legacy rcsb.org/pdb/json API -- confirm the
    endpoint is still served.
    """
    return fetch(f'https://www.rcsb.org/pdb/json/describeMol?structureId={pdb_id}')

def pdb_ligand(lig_id):
    """Return the ``describeHet`` JSON record for chemical ID ``lig_id``.

    The result is whatever ``fetch`` returns for the request URL.
    NOTE(review): this is the legacy rcsb.org/pdb/json API -- confirm the
    endpoint is still served.
    """
    return fetch(f'https://www.rcsb.org/pdb/json/describeHet?chemicalID={lig_id}')

def pdb_go_terms(pdb_id):
    """Return the ``goTerms`` JSON record for structure ``pdb_id``.

    The result is whatever ``fetch`` returns for the request URL.
    NOTE(review): this is the legacy rcsb.org/pdb/json API -- confirm the
    endpoint is still served.
    """
    return fetch(f'https://www.rcsb.org/pdb/json/goTerms?structureId={pdb_id}')

import urllib.parse
import urllib.request

def pdb_uniprot(pdb_id, timeout=30):
    """Print the UniProt mapping returned for a PDB identifier.

    POSTs a url-encoded form (``from=PDB_ID``, ``to=ACC``, ``format=tab``,
    ``query=pdb_id``) to the UniProt ``uploadlists`` endpoint and prints the
    UTF-8 decoded response body.

    Parameters
    ----------
    pdb_id : str
        Identifier sent as the ``query`` form field.
    timeout : float, optional
        Seconds to wait on the connection, forwarded to ``urlopen`` so a
        stalled server cannot block the kernel indefinitely. Defaults to 30.

    Returns
    -------
    None. The response is only printed.

    Raises
    ------
    ``urllib.error.URLError`` (including ``HTTPError``) and timeouts propagate.

    NOTE(review): ``uploadlists`` is a legacy UniProt endpoint -- confirm it is
    still served before relying on this helper.
    """
    url = 'https://www.uniprot.org/uploadlists/'

    params = {
        'from': 'PDB_ID',
        'to': 'ACC',
        'format': 'tab',
        'query': pdb_id,
    }

    # Passing a body to Request makes this a POST.
    data = urllib.parse.urlencode(params).encode('utf-8')
    req = urllib.request.Request(url, data)
    with urllib.request.urlopen(req, timeout=timeout) as f:
        response = f.read()
    print(response.decode('utf-8'))
    return

In [None]:
# Ligand comparison table; the first CSV column becomes the row index.
# It is looked up later as ligand_matrix.loc[ligand_a, ligand_b], so rows and
# columns are presumably both keyed by ligand ID -- TODO confirm against the CSV.
ligand_matrix = pd.read_csv('./ligand_comparisons.csv', index_col=0)

In [52]:
import itertools

def _edge_cost(u, v, d):
    """Dijkstra edge cost: one minus the edge's stored weight."""
    return 1 - d['weight']


# Search ligand pairs (drawn from the first 700 ligand entries) for short paths
# through the graph. Structure-ligand edges carry weight 0, so each costs 1.
scaffold_hops = []
for i, j in itertools.combinations(set(ligands[:700]), 2):
    try:
        score, path = nx.single_source_dijkstra(
            PFG, source=i, target=j, cutoff=2.5, weight=_edge_cost
        )
    except nx.NetworkXNoPath:
        # No path within the cutoff: nothing to record for this pair.
        continue

    # A total cost of exactly 2 (e.g. both ligands attached to the same
    # structure) is not treated as a hop.
    if score == 2:
        continue

    # For a three-edge ligand-structure-structure-ligand path, 3 - score equals
    # the weight of the structure-structure edge.
    ligand_sim = ligand_matrix.loc[i, j]
    sh_score = (3 - score)/(ligand_sim)

    if sh_score < 1:
        continue

    scaffold_hops.append([sh_score, path])

sorted(scaffold_hops, key=lambda hop: hop[0], reverse=True)

[[2.6179784017672776, ['1J5', '4iva', '4gih', '0X5']],
 [2.5988371574488562, ['P44', '2pvj', '3ofm', '4B0']],
 [2.5228434580844117, ['IZA', '3lxp', '4gih', '0X5']],
 [2.4567480479615265, ['M77', '2f2u', '3v8s', '0HD']],
 [2.4219433281466536, ['ATP', '5csh', '3ofm', '4B0']],
 [2.4171380044287654, ['MPZ', '1y57', '2bdf', '24A']],
 [2.3860452401670305, ['XIN', '3c7q', '3cjf', 'SAV']],
 [2.3518477453286493, ['1J5', '4iva', '4bbf', 'O19']],
 [2.3234454660437684, ['KZI', '3ac1', '1y57', 'MPZ']],
 [2.311441409644402, ['MPZ', '1y57', '4lgg', 'VGG']],
 [2.2648947048584596, ['AXI', '4twp', '2f4j', 'VX6']],
 [2.2522893369850863, ['1J5', '4iva', '3lxk', 'MI1']],
 [2.2286393420639112, ['4B0', '3ofm', '3nsz', 'ANP']],
 [2.201493772925913, ['CK2', '4fkl', '4au8', 'Z3R']],
 [2.1912288461769376, ['CK2', '4fkl', '2zv2', '609']],
 [2.169116566235516, ['MPZ', '1y57', '3dqw', 'AGS']],
 [2.1242169112718776, ['FEF', '3mtl', '4fkl', 'CK2']],
 [2.1124751984126986, ['O22', '3vqu', '4c4f', '7CE']],
 [2.049991704

In [66]:
# Look up the edge attributes between two structures that appear next to each
# other in one of the ranked paths above.
n1, n2 = '4iva', '4bbf'
PFG[n1][n2]

{'weight': 0.6702766074186648}

In [67]:
from pprint import pprint
# Pretty-print the molecule descriptions fetched for both structures, one after the other.
pprint([pdb_mol(n1), pdb_mol(n2)] )

[{'id': '4IVA',
  'polymerDescriptions': [{'chain': {'id': 'A'},
                           'entityNr': '1',
                           'enzClass': {'ec': '2.7.10.2'},
                           'fragment': {'desc': 'UNP residues 833-1132'},
                           'length': '300',
                           'polymerDescription': {'description': 'Tyrosine-protein '
                                                                 'kinase '
                                                                 'JAK2'}}]},
 {'id': '4BBF',
  'polymerDescriptions': [{'chain': [{'id': 'A'},
                                     {'id': 'B'},
                                     {'id': 'C'},
                                     {'id': 'D'}],
                           'entityNr': '1',
                           'enzClass': {'ec': '2.7.10.2'},
                           'fragment': {'desc': 'PROTEIN TYROSINE KINASE '
                                                'DOMAIN, RESIDUES 839-1132'},
    