# Graph Construction

This notebook holds code for constructing the graph for the pocket feature REST API.

In [1]:
import pandas as pd
import networkx as nx
import numpy as np

## Import Node Info

In [2]:
all_info = pd.read_csv('./data/scPDB_Results.tsv', sep='\t')
all_info['Uniprot_Name'] = all_info['Uniprot_Name'].str.split(pat = "//").str[-1]
all_info['Uniprot_AC'] = all_info['Uniprot_AC'].str.split(pat = "//").str[-1]
all_info['Uniprot_ID'] = all_info['Uniprot_ID'].str.split(pat = "//").str[-1]

In [3]:
all_info.head()

Unnamed: 0,PDB_ID,Site_Number,Deposition_Date,Chimeric_entry,Experimental_Method,Chains,ChainPercentageInSite,Uniprot_Name,Uniprot_AC,Uniprot_ID,...,IF_bit_7,IF_bit_8,ClusterID,Cluster_Name,Cavity_Volume,Cavity_Hydrophobicity,Cavity_Polar,Cavity_Dummy,Cavity_Ligand_Recovery,Ligand_Cavity_Recovery
0,11bg,2,1999-03-11,0.0,XRay,A//B,82.00//18.00,Seminal ribonuclease,P00669,RNS_BOVIN,...,0.0,0.0,,,482.625,20.2797,60.1399,19.5804,0.0,30.0699
1,12gs,1,1997-11-19,0.0,XRay,A//B,93.00//7.00,Glutathione S-transferase P,P09211,GSTP1_HUMAN,...,1.0,0.0,,,975.375,34.9481,50.519,14.5329,0.0,20.0692
2,13gs,1,1997-11-20,0.0,XRay,A,100.00,Glutathione S-transferase P,P09211,GSTP1_HUMAN,...,0.0,0.0,,,857.25,31.8898,55.9055,12.2047,0.0,21.6535
3,17gs,1,1997-12-07,0.0,XRay,A//B,93.00//7.00,Glutathione S-transferase P,P09211,GSTP1_HUMAN,...,1.0,0.0,,,999.0,32.0946,56.7568,11.1486,0.0,21.6216
4,1a26,1,1998-01-16,0.0,XRay,A,100.00,Poly [ADP-ribose] polymerase 1,P26446,PARP1_CHICK,...,0.0,0.0,,,837.0,43.9516,45.1613,10.8871,0.0,4.0323


## Import Pocket Feature Scores

In [4]:
pf = pd.read_csv('./data/pocket_feature_scores.csv', header=None, names=['pocket_0', 'pocket_1', 'weight'])

In [5]:
pf.head()

Unnamed: 0,pocket_0,pocket_1,weight
0,1v1a_KDG,1v1a_KDG,-11.4
1,1v25_ANP,1v1a_KDG,-2.065
2,1v25_ANP,1v25_ANP,-18.6
3,1v3s_ATP,1v1a_KDG,-1.515
4,1v3s_ATP,1v25_ANP,-3.144


In [6]:
PFG = nx.from_pandas_edgelist(pf, source='pocket_0', target='pocket_1', edge_attr='weight')

## Normalize Pocket Feature Scores

In [8]:
pf_matrix = nx.to_pandas_adjacency(PFG, dtype=np.float64)

diagonal = np.sqrt(np.diag(-1*pf_matrix))
denominator = np.outer(diagonal, diagonal)
normalized = 1 - (-1*pf_matrix)/denominator

np.fill_diagonal(normalized.values, np.nan)
normalized_edges = normalized.stack().reset_index()
normalized_edges = normalized_edges.rename(columns={0:'weight'})

In [9]:
PFG = nx.from_pandas_edgelist(normalized_edges, source='level_0', target='level_1', edge_attr='weight')
PFG['1v25_ANP']['1v1a_KDG']

{'weight': 0.8581885955263642}

## Add PDB Structure Info

In [10]:
labels = [i.split('_')[0] for i in list(PFG.nodes)]
fixed_names = {k: v for k, v in zip(PFG.nodes, labels)}
PFG = nx.relabel_nodes(PFG, fixed_names)
PFG['1v25']['1v1a']

{'weight': 0.8581885955263642}

In [11]:
pocket_info = all_info.loc[:, ['PDB_ID', 'HET_CODE', 'Uniprot_AC', 'Deposition_Date', 'Experimental_Method', 'Chains', 'ChainPercentageInSite', 'Species']]
pocket_info = pocket_info.set_index('PDB_ID')

pocket_info.loc[list(PFG.nodes)]

Passing list-likes to .loc or [] with any missing label will raise
KeyError in the future, you can use .reindex() as an alternative.

See the documentation here:
https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#deprecate-loc-reindex-listlike
  after removing the cwd from sys.path.


Unnamed: 0_level_0,HET_CODE,Uniprot_AC,Deposition_Date,Experimental_Method,Chains,ChainPercentageInSite,Species
PDB_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1v1a,KDG,Q53W83,2004-04-12,XRay,A,100.00,Thermus thermophilus
1v25,ANP,Q5SKN9,2003-10-07,XRay,A,100.00,Thermus thermophilus
1v3s,ATP,P83820,2003-11-05,XRay,A//B,58.00//42.00,Thermus thermophilus//Thermus thermophilus
1v59,NAD,P09624,2003-11-21,XRay,A,100.00,Saccharomyces cerevisiae
1v59,FAD,P09624,2003-11-21,XRay,A//B,5.00//95.00,Saccharomyces cerevisiae//Saccharomyces cerevi...
...,...,...,...,...,...,...,...
5yzh,,,,,,,
6cny,,,,,,,
6d28,,,,,,,
6gsw,GPS,P04905,1996-01-26,XRay,A//B,91.00//9.00,Rattus norvegicus//Rattus norvegicus


In [31]:
pocket_info = all_info.loc[:, ['PDB_ID', 'HET_CODE', 'Uniprot_AC', 'Deposition_Date', 'Experimental_Method', 'Chains', 'ChainPercentageInSite', 'Species']]

pocket_info = pocket_info.set_index('PDB_ID')
pocket_info = pocket_info.drop_duplicates()

pocket_dict = pocket_info.to_dict()
for col in pocket_dict:
    nx.set_node_attributes(PFG, pocket_dict[col], col)
nx.set_node_attributes(PFG, 'protein_structure', name='Type')
PFG.nodes['1v25']

{'HET_CODE': 'ANP',
 'Uniprot_AC': 'Q5SKN9',
 'Deposition_Date': '2003-10-07',
 'Experimental_Method': 'XRay',
 'Chains': 'A',
 'ChainPercentageInSite': '100.00',
 'Species': 'Thermus thermophilus',
 'Type': 'protein_structure'}

## Import Ligand Tanimotos

In [11]:
ligand_matrix = pd.read_csv('./data/ligand_comparisons.csv', index_col=0)
np.fill_diagonal(ligand_matrix.values, np.nan)
ligand_edges = ligand_matrix.stack().reset_index()
ligand_edges = ligand_edges.rename(columns={'level_0':'ligand_0', 'level_1':'ligand_1', 0:'weight'})

In [12]:
Lig = nx.from_pandas_edgelist(ligand_edges, source='ligand_0', target='ligand_1', edge_attr='weight')

In [13]:
lig_info = all_info.loc[:, ['HET_CODE' ,'SMILES', 'InChI']]
lig_info = lig_info.drop_duplicates()
lig_info = lig_info.set_index('HET_CODE')

info_dict = lig_info.to_dict()
for col in info_dict:
    lig_info = nx.set_node_attributes(Lig, info_dict[col], col)
    
nx.set_node_attributes(Lig, 'chemical', name='Type')


In [14]:
Lig.nodes['00A']

{'SMILES': 'Nc1ncnc2c1ncn2C3OC(COP(=O)([O-])OC(=O)c4ccc(Cl)cc4)C(O)C3O',
 'InChI': 'InChI=1S/C17H17ClN5O8P/c18-9-3-1-8(2-4-9)17(26)31-32(27,28)29-5-10-12(24)13(25)16(30-10)23-7-22-11-14(19)20-6-21-15(11)23/h1-4,6-7,10,12-13,16,24-25H,5H2,(H,27,28)(H2,19,20,21)/p-1/t10-,12-,13-,16-/m1/s1',
 'Type': 'chemical'}

## Import Sequences

In [15]:
seq_info = all_info.loc[:, ['PDB_ID','Uniprot_Name', 'Uniprot_AC', 'Uniprot_ID']]
seq_info = seq_info.drop_duplicates()
seq_info = seq_info.set_index('Uniprot_ID')
seq_dict = seq_info.to_dict()

In [16]:
seq_matrix = pd.read_csv('./data/seq_align_matrix.csv', index_col=0, header=0)
seq_matrix = seq_matrix.loc[seq_info.index, seq_info.index]
del seq_matrix.index.name
seq_matrix = seq_matrix.loc[:,~seq_matrix.columns.duplicated()]
seq_matrix = seq_matrix.loc[~seq_matrix.index.duplicated(),:]
seq_matrix = seq_matrix/100
np.fill_diagonal(seq_matrix.values, np.nan)
seq_edges = seq_matrix.stack().reset_index()
seq_edges = seq_edges.rename(columns={'level_0':'ligand_0', 'level_1':'ligand_1', 0:'weight'})

Passing list-likes to .loc or [] with any missing label will raise
KeyError in the future, you can use .reindex() as an alternative.

See the documentation here:
https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#deprecate-loc-reindex-listlike
  return self._getitem_tuple(key)


In [17]:
Seq = nx.from_pandas_edgelist(seq_edges, source='ligand_0', target='ligand_1', edge_attr='weight')

In [18]:
for col in seq_dict:
    nx.set_node_attributes(Seq, seq_dict[col], col)

nx.set_node_attributes(Seq, 'protein', name='Type')

In [19]:
Seq.nodes['12S_PROFR']

{'PDB_ID': '1on3',
 'Uniprot_Name': 'Methylmalonyl-CoA carboxyltransferase 12S subunit',
 'Uniprot_AC': 'Q8GBW6',
 'Type': 'protein'}

In [20]:
labels = [Seq.nodes[node]['Uniprot_AC'] for node in Seq.nodes]
fixed_names = {k: v for k, v in zip(Seq.nodes, labels)}
Seq = nx.relabel_nodes(Seq, fixed_names)

In [21]:
Seq.nodes['Q8GBW6']

{'PDB_ID': '1on3',
 'Uniprot_Name': 'Methylmalonyl-CoA carboxyltransferase 12S subunit',
 'Uniprot_AC': 'Q8GBW6',
 'Type': 'protein'}

### Connect Ligands to Pockets

In [37]:
G = PFG.copy()
count = 0
names = []
for node in PFG.nodes:
    try:
        het_code = PFG.nodes[node]['HET_CODE']
        G.add_node(het_code, **Lig.nodes[het_code])
        G.add_edge(het_code, node, weight=0)
    except KeyError:
        count += 1
        names.append(node)
count

152

In [42]:
PFG.nodes['2huv']

{'Type': 'protein_structure'}

In [39]:
names

['1x31',
 '1xyy',
 '1y7l',
 '2aqi',
 '2b56',
 '2b9j',
 '2dsh',
 '2e8r',
 '2en5',
 '2fr1',
 '2fzn',
 '2huv',
 '2itv',
 '2ixb',
 '2jd1',
 '2o07',
 '2olr',
 '2p2m',
 '2p5f',
 '2p5y',
 '2p6k',
 '2pak',
 '2pb4',
 '2pb6',
 '2pch',
 '2pdb',
 '2q0l',
 '2q7k',
 '2qbn',
 '2qo5',
 '2vpz',
 '2w14',
 '2w9h',
 '2x8h',
 '2xvi',
 '2yem',
 '2yg6',
 '2ylz',
 '2yqs',
 '2yyl',
 '2zvc',
 '3bw3',
 '3c0i',
 '3cif',
 '3cls',
 '3d78',
 '3dag',
 '3dk9',
 '3dl0',
 '3dt4',
 '3dva',
 '3f47',
 '3fce',
 '3fwg',
 '3g4i',
 '3g5s',
 '3g8d',
 '3gyj',
 '3iiu',
 '3kr6',
 '3m09',
 '3m6r',
 '3m6w',
 '3mn9',
 '3mnp',
 '3o8t',
 '3oiw',
 '3p8j',
 '3px3',
 '3qft',
 '3s1d',
 '3to6',
 '3uox',
 '3vt7',
 '3vzd',
 '3w8e',
 '3wd4',
 '3wo1',
 '3zuy',
 '3zzh',
 '4awt',
 '4c04',
 '4c72',
 '4ca6',
 '4ckj',
 '4d04',
 '4dpu',
 '4e2z',
 '4eeu',
 '4eu9',
 '4eud',
 '4f1o',
 '4fl2',
 '4fl3',
 '4fvr',
 '4gm4',
 '4h3q',
 '4iai',
 '4j7h',
 '4j99',
 '4jl5',
 '4ju9',
 '4k9p',
 '4kqr',
 '4l3l',
 '4l4e',
 '4m2b',
 '4m2v',
 '4mfq',
 '4mkh',
 '4mv4',
 