In [1]:
'''Aim to find distances between all pairs of atoms given a fixed asym_id, seq_id and comp_id.
First level of computation. And then return the distance between closest pair of atoms in it.'''

'Aim to find distances between all pairs of atoms given a fixed asym_id, seq_id and comp_id.\nFirst level of computation. And then return the distance between closest pair of atoms in it.'

In [2]:
'''To read a single xml file and organise all the required elements of each atom in separate lists
and finally create a dictionary that has seq id as key and a corresponding (n_atom_in_seq,3) ndarray as value'''

'To read a single xml file and organise all the required elements of each atom in separate lists\nand finally create a dictionary that has seq id as key and a corresponding (n_atom_in_seq,3) ndarray as value'

In [3]:
#importing necessary libraries

from lxml import etree #Using the lxml library
import numpy as np #For creating ndarrays
import scipy # For finding distances
from scipy import spatial # For finding distances
from collections import namedtuple # For creating a custom tuple of asym_id, seq_id and comp_id as key

In [4]:
# Takes the filename of the PDB XML file as a parameter and returns lists corresponding to the x, y, z coordinates,
# asym id, atom id, compound id and sequence id, along with the number of atoms in the complex (called indexes).
# Uses the auth_* versions of the identifiers.

def readPdbxml(filename):
    """Read per-atom coordinates and identifiers from a PDB XML file.

    Every child of the root's first element is treated as one atom record,
    and its sub-elements are read by position: 1-3 are x, y, z (parsed as
    float); 4 is the asym id, 5 the atom id, 6 the compound id and 7 the
    sequence id (all kept as text).
    NOTE(review): this positional layout is assumed, not validated -- confirm
    it against the schema of the input files.

    Parameters
    ----------
    filename : anything accepted by ``etree.parse`` (e.g. a path to the XML file).

    Returns
    -------
    tuple
        ``(x_coord, y_coord, z_coord, asym_id, atom_id, comp_id, seq_id, indexes)``:
        seven parallel lists with one entry per atom record, followed by the
        number of atom records.

    Raises
    ------
    IndexError
        If the root has no children or a record has fewer than 8 sub-elements.
    TypeError, ValueError
        If a coordinate element has no text or its text is not a number.
    """
    root = etree.parse(filename).getroot()  # Parse the file and keep the root as reference

    atom_records = root[0]       # First child of the root holds the atom records
    indexes = len(atom_records)  # Total number of atom records

    x_coord, y_coord, z_coord = [], [], []
    asym_id, atom_id, comp_id, seq_id = [], [], [], []

    # Walk the records once instead of looking up root[0][i] again for every field.
    for atom in atom_records:
        x_coord.append(float(atom[1].text))
        y_coord.append(float(atom[2].text))
        z_coord.append(float(atom[3].text))
        asym_id.append(atom[4].text)
        atom_id.append(atom[5].text)
        comp_id.append(atom[6].text)
        seq_id.append(atom[7].text)

    return x_coord, y_coord, z_coord, asym_id, atom_id, comp_id, seq_id, indexes
    
    

In [5]:
# Function to create a dictionary keyed by an (asym_id, seq_id, comp_id) namedtuple whose values are lists of
# length-3 numpy arrays, where each array holds the x, y, z coordinates of one atom

def seq_dict(seq_id, x_coord, y_coord, z_coord, indexes, asym_id=None, comp_id=None):
    """Group atom coordinates by their (asym_id, seq_id, comp_id) key.

    Parameters
    ----------
    seq_id : sequence of str
        Sequence id of every atom; each entry is converted with ``int``.
    x_coord, y_coord, z_coord : sequences of float
        Coordinates of every atom, parallel to ``seq_id``.
    indexes : int
        Number of atoms to process (the first ``indexes`` entries are used).
    asym_id, comp_id : sequences of str, optional
        Asym id and compound id of every atom. When omitted, the notebook-level
        variables of the same names are used, which is what this function
        always did implicitly. Passing them explicitly removes that hidden
        dependency on kernel state.

    Returns
    -------
    dict
        Maps ``level1_key(asym_id, seq_id, comp_id)`` namedtuples to a list of
        length-3 numpy arrays ``[x, y, z]``, one per atom, in input order.

    Raises
    ------
    NameError
        If ``asym_id``/``comp_id`` is omitted and no global of that name exists.
    ValueError
        If a sequence id is not an integer string.
    """
    # Backward compatibility: existing calls pass only five arguments and rely
    # on the globals produced by readPdbxml.
    if asym_id is None or comp_id is None:
        notebook_scope = globals()
        try:
            if asym_id is None:
                asym_id = notebook_scope["asym_id"]
            if comp_id is None:
                comp_id = notebook_scope["comp_id"]
        except KeyError as missing:
            raise NameError("name %r is not defined" % missing.args[0]) from None

    # Build the key type once; creating a new class for every atom is wasted work.
    level1_key = namedtuple("level1_key", ["asym_id", "seq_id", "comp_id"])

    seq_data = {}
    for each_index in range(indexes):
        key = level1_key(asym_id[each_index], int(seq_id[each_index]), comp_id[each_index])
        location = np.asarray([x_coord[each_index], y_coord[each_index], z_coord[each_index]])
        # Atoms sharing a key are accumulated, not overwritten.
        seq_data.setdefault(key, []).append(location)

    return seq_data
    

In [7]:
# Function takes in a dict with list of numpy arrays as values and the level1 key as key and returns 
# a dict with n*3 numpy arrays ( no more lists) as values and same keys

def lists_to_ndarrays(seq_data):
    """Convert every value of ``seq_data`` into a numpy array.

    Each value (a list of per-atom coordinate arrays) is passed to ``np.array``;
    the keys are kept unchanged and a new dict is returned.
    """
    return {
        level1_key: np.array(atom_locations)  # stack the per-atom arrays into one ndarray
        for level1_key, atom_locations in seq_data.items()
    }

In [8]:
# Returns, for each level1 key, the matrix of pairwise Euclidean distances between the atoms of that compound

def interaction_matrix(dict_ndarr_seq):
    """Compute the pairwise Euclidean distance matrix of every coordinate array.

    For each key of ``dict_ndarr_seq`` the value is compared against itself
    with ``cdist``. The result is a new dict with the same keys, whose values
    are the square distance matrices.
    """
    pairwise = scipy.spatial.distance.cdist
    return {
        group_key: pairwise(coords, coords, 'euclidean')
        for group_key, coords in dict_ndarr_seq.items()
    }

In [9]:
# Do this min value thing for all compounds by iterating over all keys in a loop

'''Function takes in dict_ndarr_seq as input and returns a dict that has the level1_key as key and the distance
between the closest pair of atoms in the given compound of given seq_id and given asym_id as value'''

def closest_atoms_dict(dict_ndarr_seq):
    """Return the smallest non-zero inter-atomic distance for every key.

    Parameters
    ----------
    dict_ndarr_seq : dict
        Maps each key to a 2-D array of atom coordinates (one row per atom).

    Returns
    -------
    dict
        Same keys. Each value is the minimum of the non-zero entries of the
        pairwise Euclidean distance matrix of that group. Groups with no
        non-zero distance (a single atom, or only coincident atoms) map to
        ``nan``, where the bare ``np.min`` used to raise ``ValueError``.
    """
    dict_min_pairwise_atom_dist = {}

    for each_key, coords in dict_ndarr_seq.items():
        interaction = scipy.spatial.distance.cdist(coords, coords, 'euclidean')

        # Keeping only non-zero entries drops the diagonal (self-distances).
        nonzero_distances = interaction[np.nonzero(interaction)]

        # np.min has no identity for an empty selection, so guard it.
        if nonzero_distances.size:
            closest_atomic_pair = np.min(nonzero_distances)
        else:
            closest_atomic_pair = np.nan

        dict_min_pairwise_atom_dist[each_key] = closest_atomic_pair

    return dict_min_pairwise_atom_dist


In [12]:
cd dataset/

/home/arrayslayer/Documents/Acads/2-2/SOP/dataset


In [13]:
# Function calls: run the whole pipeline on one structure file.

# Parse the atom records; the path is relative to the current working directory.
x_coord,y_coord,z_coord,asym_id,atom_id,comp_id,seq_id,indexes = readPdbxml('1a9n.xml')
# Group coordinates per (asym_id, seq_id, comp_id) key.
# NOTE: seq_dict also reads the notebook-level asym_id and comp_id assigned on the line above.
seq_data = seq_dict(seq_id,x_coord,y_coord,z_coord,indexes)
# Turn each group's list of coordinate arrays into a single ndarray.
dict_ndarr_seq = lists_to_ndarrays(seq_data)
# Pairwise Euclidean distance matrix for every group.
interaction = interaction_matrix(dict_ndarr_seq)
# Smallest non-zero distance within every group.
dict_min_pairwise_atom_dist = closest_atoms_dict(dict_ndarr_seq)

In [27]:
'''These numbers are different because each level1 tuple key has multiple atoms (values) associated with it'''

# Number of atom records, then number of distinct level1 keys, one per line.
print(indexes, len(dict_ndarr_seq), sep="\n")

5188
571


In [29]:
# Print the asym id, sequence id and compound id of every atom, one atom per line.
for position, residue_number in enumerate(seq_id):
    print(" ".join((asym_id[position], residue_number, comp_id[position])))

Q 0 C
Q 0 C
Q 0 C
Q 0 C
Q 0 C
Q 0 C
Q 0 C
Q 0 C
Q 0 C
Q 0 C
Q 0 C
Q 0 C
Q 0 C
Q 0 C
Q 0 C
Q 0 C
Q 0 C
Q 1 C
Q 1 C
Q 1 C
Q 1 C
Q 1 C
Q 1 C
Q 1 C
Q 1 C
Q 1 C
Q 1 C
Q 1 C
Q 1 C
Q 1 C
Q 1 C
Q 1 C
Q 1 C
Q 1 C
Q 1 C
Q 1 C
Q 1 C
Q 2 U
Q 2 U
Q 2 U
Q 2 U
Q 2 U
Q 2 U
Q 2 U
Q 2 U
Q 2 U
Q 2 U
Q 2 U
Q 2 U
Q 2 U
Q 2 U
Q 2 U
Q 2 U
Q 2 U
Q 2 U
Q 2 U
Q 2 U
Q 3 G
Q 3 G
Q 3 G
Q 3 G
Q 3 G
Q 3 G
Q 3 G
Q 3 G
Q 3 G
Q 3 G
Q 3 G
Q 3 G
Q 3 G
Q 3 G
Q 3 G
Q 3 G
Q 3 G
Q 3 G
Q 3 G
Q 3 G
Q 3 G
Q 3 G
Q 3 G
Q 4 G
Q 4 G
Q 4 G
Q 4 G
Q 4 G
Q 4 G
Q 4 G
Q 4 G
Q 4 G
Q 4 G
Q 4 G
Q 4 G
Q 4 G
Q 4 G
Q 4 G
Q 4 G
Q 4 G
Q 4 G
Q 4 G
Q 4 G
Q 4 G
Q 4 G
Q 4 G
Q 5 U
Q 5 U
Q 5 U
Q 5 U
Q 5 U
Q 5 U
Q 5 U
Q 5 U
Q 5 U
Q 5 U
Q 5 U
Q 5 U
Q 5 U
Q 5 U
Q 5 U
Q 5 U
Q 5 U
Q 5 U
Q 5 U
Q 5 U
Q 6 A
Q 6 A
Q 6 A
Q 6 A
Q 6 A
Q 6 A
Q 6 A
Q 6 A
Q 6 A
Q 6 A
Q 6 A
Q 6 A
Q 6 A
Q 6 A
Q 6 A
Q 6 A
Q 6 A
Q 6 A
Q 6 A
Q 6 A
Q 6 A
Q 6 A
Q 7 U
Q 7 U
Q 7 U
Q 7 U
Q 7 U
Q 7 U
Q 7 U
Q 7 U
Q 7 U
Q 7 U
Q 7 U
Q 7 U
Q 7 U
Q 7 U
Q 7 U
Q 7 U
Q 7 U
Q 7 U
Q 7 U
Q 7 U
Q 8 U
Q 8 

A 106 LEU
A 106 LEU
A 106 LEU
A 106 LEU
A 107 ASP
A 107 ASP
A 107 ASP
A 107 ASP
A 107 ASP
A 107 ASP
A 107 ASP
A 107 ASP
A 108 PRO
A 108 PRO
A 108 PRO
A 108 PRO
A 108 PRO
A 108 PRO
A 108 PRO
A 109 LEU
A 109 LEU
A 109 LEU
A 109 LEU
A 109 LEU
A 109 LEU
A 109 LEU
A 109 LEU
A 110 ALA
A 110 ALA
A 110 ALA
A 110 ALA
A 110 ALA
A 111 SER
A 111 SER
A 111 SER
A 111 SER
A 111 SER
A 111 SER
A 112 LEU
A 112 LEU
A 112 LEU
A 112 LEU
A 112 LEU
A 112 LEU
A 112 LEU
A 112 LEU
A 113 LYS
A 113 LYS
A 113 LYS
A 113 LYS
A 113 LYS
A 114 SER
A 114 SER
A 114 SER
A 114 SER
A 114 SER
A 114 SER
A 115 LEU
A 115 LEU
A 115 LEU
A 115 LEU
A 115 LEU
A 115 LEU
A 115 LEU
A 115 LEU
A 116 THR
A 116 THR
A 116 THR
A 116 THR
A 116 THR
A 116 THR
A 116 THR
A 117 TYR
A 117 TYR
A 117 TYR
A 117 TYR
A 117 TYR
A 117 TYR
A 117 TYR
A 117 TYR
A 117 TYR
A 117 TYR
A 117 TYR
A 117 TYR
A 118 LEU
A 118 LEU
A 118 LEU
A 118 LEU
A 118 LEU
A 118 LEU
A 118 LEU
A 118 LEU
A 119 CYS
A 119 CYS
A 119 CYS
A 119 CYS
A 119 CYS
A 119 CYS
A 120 ILE
A 120 ILE


B 55 ALA
B 56 PHE
B 56 PHE
B 56 PHE
B 56 PHE
B 56 PHE
B 56 PHE
B 56 PHE
B 56 PHE
B 56 PHE
B 56 PHE
B 56 PHE
B 57 VAL
B 57 VAL
B 57 VAL
B 57 VAL
B 57 VAL
B 57 VAL
B 57 VAL
B 58 ILE
B 58 ILE
B 58 ILE
B 58 ILE
B 58 ILE
B 58 ILE
B 58 ILE
B 58 ILE
B 59 PHE
B 59 PHE
B 59 PHE
B 59 PHE
B 59 PHE
B 59 PHE
B 59 PHE
B 59 PHE
B 59 PHE
B 59 PHE
B 59 PHE
B 60 LYS
B 60 LYS
B 60 LYS
B 60 LYS
B 60 LYS
B 61 GLU
B 61 GLU
B 61 GLU
B 61 GLU
B 61 GLU
B 61 GLU
B 61 GLU
B 61 GLU
B 61 GLU
B 62 LEU
B 62 LEU
B 62 LEU
B 62 LEU
B 62 LEU
B 62 LEU
B 62 LEU
B 62 LEU
B 63 GLY
B 63 GLY
B 63 GLY
B 63 GLY
B 64 SER
B 64 SER
B 64 SER
B 64 SER
B 64 SER
B 64 SER
B 65 SER
B 65 SER
B 65 SER
B 65 SER
B 65 SER
B 65 SER
B 66 THR
B 66 THR
B 66 THR
B 66 THR
B 66 THR
B 66 THR
B 66 THR
B 67 ASN
B 67 ASN
B 67 ASN
B 67 ASN
B 67 ASN
B 67 ASN
B 67 ASN
B 67 ASN
B 68 ALA
B 68 ALA
B 68 ALA
B 68 ALA
B 68 ALA
B 69 LEU
B 69 LEU
B 69 LEU
B 69 LEU
B 69 LEU
B 69 LEU
B 69 LEU
B 69 LEU
B 70 ARG
B 70 ARG
B 70 ARG
B 70 ARG
B 70 ARG
B 70 ARG
B 70 ARG
B

In [14]:
# Coordinates (one row per atom) stored under asym id 'Q', sequence id 0, compound 'C'.
# A plain tuple works as the key because namedtuple keys hash and compare like tuples.
dict_ndarr_seq['Q',0,'C']

array([[ 12.992,  13.174,   4.15 ],
       [ 12.941,  12.992,   5.572],
       [ 11.563,  12.519,   5.976],
       [ 10.589,  13.567,   5.735],
       [ 11.015,  11.307,   5.226],
       [ 11.382,  10.115,   5.913],
       [  9.503,  11.532,   5.384],
       [  9.07 ,  11.077,   6.663],
       [  9.386,  13.038,   5.199],
       [  9.269,  13.435,   3.756],
       [  7.984,  13.456,   3.215],
       [  7.032,  13.155,   3.95 ],
       [  7.8  ,  13.798,   1.922],
       [  8.828,  14.119,   1.137],
       [  8.578,  14.449,  -0.122],
       [ 10.15 ,  14.1  ,   1.673],
       [ 10.319,  13.757,   2.966]])

In [15]:
# Pairwise Euclidean distance matrix of the atoms under asym id 'Q', sequence id 0, compound 'C'
# (the diagonal is zero: each atom against itself).
interaction[('Q',0,'C')]

array([[ 0.        ,  1.43450654,  2.40942773,  2.9053542 ,  2.92437925,
         3.88043168,  4.04870856,  5.10829345,  3.75794266,  3.75287703,
         5.10233407,  5.96338503,  5.6842083 ,  5.22590566,  6.27367556,
         3.8820058 ,  2.9810525 ],
       [ 1.43450654,  0.        ,  1.51189583,  2.42674638,  2.58232783,
         3.28996824,  3.73989144,  4.45445249,  3.57481048,  4.12039913,
         5.5084112 ,  6.12974176,  6.3562581 ,  6.15273297,  7.31985341,
         4.92133579,  3.77509536],
       [ 2.40942773,  1.51189583,  0.        ,  1.45088283,  1.52700622,
         2.41162725,  2.35971036,  2.96080766,  2.36905445,  3.32112812,
         4.61631141,  5.00391177,  5.67722873,  5.7841288 ,  7.05838005,
         4.79707609,  3.48429046],
       [ 2.9053542 ,  2.42674638,  1.45088283,  0.        ,  2.35545261,
         3.54638365,  2.33319995,  3.06082424,  1.41927658,  2.38249134,
         3.62611996,  4.00102712,  4.7297834 ,  4.9545362 ,  6.25511742,
         4.12027353

In [None]:
# TODO: How should the case be handled where the same ASYM_ID, COMP_ID and SEQ_ID occur for more than one entry? Right now it is overwriting; this needs to change.

# You want to do it for a (COMP_ID,SEQ_ID)key inside a given ASYM_ID?