In [1]:
import pandas as pd
import networkx as nx
import numpy as np
import requests

Notes.  Query should have a range.  Get all edges between (x, y).  

Looks like similarity around .9 gives same protein.  0,7

In [61]:
pf = pd.read_csv('./data/pocket_feature_scores.csv', header=None, names=['pocket_0', 'pocket_1', 'weight'])
PFG = nx.from_pandas_edgelist(pf, source='pocket_0', target='pocket_1', edge_attr='weight')

pf_matrix = nx.to_pandas_adjacency(PFG, dtype=np.float64)

diagonal = np.sqrt(np.diag(-1*pf_matrix))
denominator = np.outer(diagonal, diagonal)
normalized = (-1*pf_matrix)/denominator

normalized[normalized < 0.3] = np.nan
normalized[normalized > 0.6] = np.nan


np.fill_diagonal(normalized.values, np.nan)
normalized_edges = normalized.stack().reset_index()
normalized_edges = normalized_edges.rename(columns={0:'weight'})
PFG = nx.from_pandas_edgelist(normalized_edges, source='level_0', target='level_1', edge_attr='weight')
# PFG['3qfz_NOJ']['4zlg_LGC']
print(normalized_edges.shape)

(85648, 3)


In [62]:
structures_to_ligands = [i.split('_') for i in list(PFG.nodes)] 
labels, ligands = zip( *structures_to_ligands )
fixed_names = {k: v for k, v in zip(PFG.nodes, labels)}
PFG = nx.relabel_nodes(PFG, fixed_names)
# PFG['3qfz']['4zlg']

In [63]:
PFG.add_edges_from(structures_to_ligands, weight=0)

In [64]:
def fetch(url):
    response = requests.get(url)
    if response.status_code != 200:
        return {}
    else: 
        return response.json()    

def pdb_file(pdb_id):
    request = f'https://www.rcsb.org/pdb/json/describePDB?structureId={pdb_id}'
    return fetch(request)

def pdb_mol(pdb_id):
    request = f'https://www.rcsb.org/pdb/json/describeMol?structureId={pdb_id}'
    return fetch(request)

def pdb_ligand(lig_id):
    request = f'https://www.rcsb.org/pdb/json/describeHet?chemicalID={lig_id}'
    return fetch(request)

def pdb_go_terms(pdb_id):
    request = f'https://www.rcsb.org/pdb/json/goTerms?structureId={pdb_id}'
    return fetch(request)

import urllib.parse
import urllib.request

def pdb_uniprot(pdb_id):
    url = 'https://www.uniprot.org/uploadlists/'

    params = {
    'from': 'PDB_ID',
    'to': 'ACC',
    'format': 'tab',
    'query': pdb_id
    }

    data = urllib.parse.urlencode(params)
    data = data.encode('utf-8')
    req = urllib.request.Request(url, data)
    with urllib.request.urlopen(req) as f:
       response = f.read()
    print(response.decode('utf-8'))
    return

In [65]:
ligand_matrix = pd.read_csv('./data/ligand_comparisons.csv', index_col=0)

In [66]:
import itertools

scaffold_hops = []
for i,j in itertools.combinations(set(ligands[:700]), 2):
    try:
        score, path = nx.single_source_dijkstra(PFG, 
                                                source=i, 
                                                target=j, 
                                                cutoff=2.5, 
                                                weight=lambda u , v, d: (1-d['weight']) 
                                               )
        if score == 2: continue
        ligand_sim = ligand_matrix.loc[i, j]
        sh_score = (3 - score)/(ligand_sim)
        
        if sh_score < 1: continue
            
        scaffold_hops.append([sh_score, path] )
        
    except nx.NetworkXNoPath:
        pass
sorted(scaffold_hops, key=lambda x: x[0], reverse=True)

[[2.201493772925913, ['Z3R', '4au8', '4fkl', 'CK2']],
 [2.1912288461769376, ['609', '2zv2', '4fkl', 'CK2']],
 [2.1242169112718776, ['FEF', '3mtl', '4fkl', 'CK2']],
 [2.0139204768227073, ['0X5', '4gih', '3zc6', 'VFC']],
 [1.9990138067061136, ['GVD', '2vn9', '4fkl', 'CK2']],
 [1.9648332196113023, ['SJJ', '4afj', '3e5a', 'VX6']],
 [1.944923728497329, ['0X5', '4gih', '4bbf', 'O19']],
 [1.9411734091735795, ['0X5', '4gih', '4cki', 'ADN']],
 [1.9276287782034907, ['Z3R', '4au8', '2zv2', '609']],
 [1.9239797337567082, ['BDY', '3lcd', '3cjf', 'SAV']],
 [1.9174173334969729, ['KZI', '3ac1', '4lgg', 'VGG']],
 [1.902294395131512, ['4B0', '3ofm', '3juh', 'ANP']],
 [1.8987417001211748, ['ADN', '4cki', '3cjf', 'SAV']],
 [1.861277754058638, ['0X5', '4gih', '2bdf', '24A']],
 [1.837656347251599, ['ADP', '3niz', '4fkl', 'CK2']],
 [1.8338388093024958, ['MPZ', '1y57', '4ckj', 'ADN']],
 [1.833307227328713, ['Z3R', '4au8', '3mtl', 'FEF']],
 [1.8267622546820874, ['KZI', '3ac1', '4k11', '0J9']],
 [1.790223608430

In [75]:
path = ['BDY', '3lcd', '3cjf', 'SAV']
n1, n2 = path[1], path[2]
m1, m2 = path[0], path[3]
PFG[n1][n2]

{'weight': 0.5502582038544186}

In [76]:
from pprint import pprint
pprint([pdb_mol(n1), pdb_mol(n2)] )
pprint(pdb_ligand(m1) + pdb_ligand(m2))

[{'id': '3LCD',
  'polymerDescriptions': [{'chain': {'id': 'A'},
                           'entityNr': '1',
                           'enzClass': {'ec': '2.7.10.1'},
                           'fragment': {'desc': 'Kinase Domain'},
                           'length': '329',
                           'polymerDescription': {'description': 'Macrophage '
                                                                 'colony-stimulating '
                                                                 'factor 1 '
                                                                 'receptor'}}]},
 {'id': '3CJF',
  'polymerDescriptions': [{'chain': {'id': 'A'},
                           'entityNr': '1',
                           'enzClass': {'ec': '2.7.10.1'},
                           'fragment': {'desc': 'kinase domain; residues '
                                                '806-939 and 994-1168'},
                           'length': '309',
                           'mutation':