In [1]:
%pylab inline
import dfi

Populating the interactive namespace from numpy and matplotlib


In [2]:
# Load the 5pnt structure; the reader reports 157 atoms for this file.
dfi.fetch_pdb('5pnt')  # presumably fetches 5pnt.pdb into the working directory — confirm in dfi
# NOTE(review): CAonly=True presumably keeps only the C-alpha atom of each residue — confirm in dfi.pdbio
ATOMS = dfi.pdbio.pdb_reader('5pnt.pdb',CAonly=True)
x,y,z = dfi.getcoords(ATOMS)
# Number of coordinates; reused below as the length of the model diagonal.
numres = len(x)

Read 157 atoms from the 5pnt.pdb


In [3]:

def calc_hessian(x,y,z,Verbose=False,cutoff=10,gamma=1):
    """
    Build the Kirchhoff (contact) matrix for a set of coordinates.

    Despite the function name, the result is the N x N connectivity
    matrix, not a coordinate Hessian: two points are in contact when
    their distance is less than or equal to ``cutoff``.  Every contact
    (i, j) puts ``-gamma`` at [i, j] and [j, i] and adds ``gamma`` to
    both diagonal entries, so each row sums to zero.

    Input
    -----
    (x,y,z) numpy array
       Must all be the same length
    Verbose: bool
       If True, print "i j squared_distance" for every contact found
       (i and j are 0-based indices into the coordinate arrays).
    cutoff: number
       Contact distance, in the same units as the coordinates.
    gamma: number
       Weight assigned to every contact.
    Output
    ------
    kirchhoff: NxN numpy matrix

    """
    xyz = np.column_stack((x, y, z))
    numres = xyz.shape[0]
    kirchhoff = np.zeros((numres, numres))
    # Squared distances are compared against the squared cutoff so no square
    # root is needed; the threshold is loop-invariant and computed once.
    cutoff2 = cutoff * cutoff
    for i in range(numres):
        first = i + 1
        # Only the points after i are examined; symmetry fills in the rest.
        delta = xyz[first:] - xyz[i]
        dist2 = np.multiply(delta, delta).sum(1)
        for offset in np.flatnonzero(dist2 <= cutoff2):
            # offset is relative to the slice, so shift it back to a row index.
            j = first + offset
            if Verbose:
                # Single pre-formatted string prints the same on Python 2 and 3.
                print('%d %d %s' % (i, j, dist2[offset]))
            kirchhoff[i, j] = -gamma
            kirchhoff[j, i] = -gamma
            kirchhoff[i, i] += gamma
            kirchhoff[j, j] += gamma
    return kirchhoff

In [4]:
# Contact (Kirchhoff) matrix built from the 5pnt coordinates loaded above.
kirchhoff = calc_hessian(x,y,z)

# Invert this matrix

In [5]:
def invert_kirchhoff(kirchhoff, tol=1e-6):
    """
    Pseudo-invert a Kirchhoff matrix through its singular value decomposition.

    Singular values below ``tol`` are treated as zero and left out of the
    inverse.  Exactly one such value is required; any other count raises
    AssertionError.

    Input
    -----
    kirchhoff: NxN numpy matrix
    tol: float
       Absolute threshold below which a singular value counts as zero.
    Output
    ------
    inv_kirchhoff: NxN numpy matrix
    """
    from scipy import linalg as LA
    U, w, Vt = LA.svd(kirchhoff, full_matrices=False)
    singular = w < tol
    num_singular = int(np.sum(singular))
    assert num_singular == 1, (
        'expected exactly one singular value below %g, found %d'
        % (tol, num_singular))
    # Invert only the retained singular values, so the near-zero one is never
    # divided by (no inf and no divide-by-zero RuntimeWarning).
    invw = np.zeros_like(w)
    invw[~singular] = 1.0 / w[~singular]
    # U * invw scales column k of U by invw[k], i.e. U.dot(diag(invw)).
    inv_kirchhoff = np.dot(U * invw, Vt)
    return inv_kirchhoff

In [6]:
# Pseudo-inverse of the structure-based Kirchhoff matrix (its near-zero
# singular value is dropped inside invert_kirchhoff).
inv_kirchhoff = invert_kirchhoff(kirchhoff)

In [7]:
# Leading numres diagonal entries of the pseudo-inverse; .copy() yields an
# independent, writable array rather than a view into inv_kirchhoff.
diag = np.diagonal(inv_kirchhoff)[:numres].copy()

In [8]:
# temp_factor of every atom in ATOMS, in the same order as the coordinates.
betafactors = np.array([atom.temp_factor for atom in ATOMS])

In [9]:
# Correlation coefficient between the structure-based diagonal and the PDB
# temperature factors ([0,1] is the off-diagonal of the 2x2 matrix), to 2 decimals.
np.corrcoef(diag,betafactors).round(2)[0,1]

0.62

In [10]:
def _build_kirchhoff(evod_file,n):
    """
    Creates a kirchoff matrix using EVfold contacts
    Input
    -----
    evfold input file: str
       file from evfold
    n: size of the square matrix
    Output
    ------
    kirchoff: NxN numpy matrix
       output matrix
    """

    chain = []
    chain_connection = np.zeros((n,n))
    
    #assign a -1 for residues in contact in the chain
    for i in range(2, n-2):
        chain_connection[i, i+1] = -1
        chain_connection[i, i+2] = -1
        #chain_connection[i, i+3] = -1
        chain_connection[i+1, i] = -1
        chain_connection[i+2, i] = -1
        #chain_connection[i+3, i] = -1
        chain_connection[i, i-1] = -1
        chain_connection[i, i-2] = -1
        #chain_connection[i, i-3] = -1
        chain_connection[i-1, i] = -1
        chain_connection[i-2, i] = -1
        #chain_connection[i-3, i] = -1
        
        chain.append([i, i+1, chain_connection[i, i+1]])
        chain.append([i, i+2, chain_connection[i, i+2]])
        #chain.append([i, i+3, chain_connection[i, i+3]])
        chain.append([i+1, i, chain_connection[i+1, i]])
        chain.append([i+2, i, chain_connection[i+2, i]])
        #chain.append([i+3, i, chain_connection[i+3, i]])
        chain.append([i, i-1, chain_connection[i, i-1]])
        chain.append([i, i-2, chain_connection[i, i-2]])
        #chain.append([i, i-3, chain_connection[i, i-3]])
        chain.append([i-1, i, chain_connection[i-1, i]])
        chain.append([i-2, i, chain_connection[i-2, i]])
        #chain.append([i-3, i, chain_connection[i-3, i]])
        
    #assign a -1 for EC pairs
    evol = []
    contact_pairs = open(evod_file, 'rU').readlines() 
    evol_const = np.zeros((n,n))
    for line in contact_pairs:
        a = line.split()
        i = int(a[0]) - 1 
        j = int(a[2]) - 1 
        if (chain_connection[i, j] != -1):
            evol_const[i, j] = -1.0*float(a[5])
            evol_const[j, i] = -1.0*float(a[5])
            evol.append([i, j, evol_const[i, j]])
            evol.append([j, i, evol_const[j, i]])
    
    #build kirchoff matrix
    kirchhoff = np.zeros((n,n))
    kirchhoff = chain_connection + evol_const
    print 'generated kirchhoff using evolutionary constraints'
    print 'kirchhoff shape: ', kirchhoff.shape
    
    #calculate the diagonal
    diag = []
    for i in range(0, n):
        kirchhoff[i, i] = -np.sum(kirchhoff[i])
        diag.append([i, i, kirchhoff[i, i]])
    
    #put everything together for a file
    all = chain + evol + diag
    f = open('evfold_kirchhoff.txt', 'w')
    for x in all:
        f.write('%s \t %s \t %s \n' % (x[0], x[1], x[2]))
    f.close()
    
    return kirchhoff;

In [11]:
# Contact file for _build_kirchhoff, which reads the 1st, 3rd and 6th
# whitespace-separated fields of each line; path is relative to the notebook.
evoDfile='./data/5pnt_MI_DI.txt'

In [12]:
# NOTE(review): 158 is hard-coded and is one more than the 157 atoms read from
# 5pnt.pdb; presumably it is the length of the full FASTA sequence — consider
# deriving it from the sequence instead of a literal.
evodkirchhoff=_build_kirchhoff(evoDfile,158)

generated kirchhoff using evolutionary constraints
kirchhoff shape:  (158, 158)


In [13]:
# Size of the EVfold-based Kirchhoff matrix (158 for this run).
numseq=evodkirchhoff.shape[0]

In [14]:
# Pseudo-inverse of the EVfold-based Kirchhoff matrix; invert_kirchhoff asserts
# that exactly one singular value is (near) zero.
inv_evodkirchhoff=invert_kirchhoff(evodkirchhoff)

In [15]:
# Leading numseq diagonal entries of the EVfold pseudo-inverse, copied into an
# independent, writable array.
evo_diag = np.diagonal(inv_evodkirchhoff)[:numseq].copy()

In [16]:
# evo_diag has 158 entries and diag has 157, so the first EVfold entry is
# dropped by hand to make the lengths match.
# NOTE(review): this assumes the only mismatch is one leading residue; the
# alignment cells further down check that assumption.
np.corrcoef(diag,evo_diag[1:]).round(2)[0,1]

0.66000000000000003

In [17]:
# PDB temperature factors vs. the EVfold-based diagonal, using the same manual
# one-entry offset ([1:]) to match the 157 PDB values.
np.corrcoef(betafactors,evo_diag[1:]).round(2)[0,1]

0.72999999999999998

In [18]:
# Same structure-vs-temperature-factor correlation as computed in In [9],
# repeated here next to the two EVfold correlations for comparison.
np.corrcoef(diag,betafactors).round(2)[0,1]

0.62

# Need to align sequences properly

## Grab the 5pnt fasta sequence 

In [19]:
import dfi.fastaseq 

In [20]:
# Drop the first line of the returned text (presumably the FASTA header —
# confirm in dfi.fastaseq) and join the remaining lines into one string.
str_seq=''.join( dfi.fastaseq.get_fastaseq('P24666').split('\n')[1:] )

In [21]:
# One list element per character of the sequence string.
seq = list(str_seq)

In [22]:
import pandas as pd

In [23]:
# Empty table; the per-position columns are added one by one in later cells.
dfx = pd.DataFrame()

In [24]:
# One row per FASTA position; this first column fixes the table length.
dfx['fasta_R'] =seq

In [25]:
# Sanity check: the EVfold matrix size must equal the number of table rows
# for the column assignment in the next cell to succeed.
evodkirchhoff.shape

(158, 158)

In [26]:
# Diagonal of the inverted EVfold Kirchhoff matrix, one value per table row.
dfx['b-evfold'] = evo_diag

In [27]:
# Preview the first five rows of the table.
dfx.head()

Unnamed: 0,fasta_R,b-evfold
0,M,2.367071
1,A,1.653347
2,E,1.379729
3,Q,1.175689
4,A,0.987923


In [28]:
# The structure-based diagonal has 157 entries, one fewer than the 158 table
# rows, so it cannot be added as a column without aligning it first.
diag.shape

(157,)

# Need to align the PDB residues to the FASTA sequence

In [29]:
import dfi.fasta_convert

In [30]:
# Residue letters taken from the PDB file (the reader again reports 157 atoms).
pdbseq=dfi.fasta_convert.get_seq('5pnt.pdb')

Read 157 atoms from the 5pnt.pdb


In [31]:
# First ten residues from the PDB file, for comparison with the FASTA start.
pdbseq[:10]

['A', 'E', 'Q', 'A', 'T', 'K', 'S', 'V', 'L', 'F']

In [32]:
# First ten residues of the FASTA sequence; it has a leading 'M' that the PDB
# sequence shown above does not.
seq[:10]

['M', 'A', 'E', 'Q', 'A', 'T', 'K', 'S', 'V', 'L']

In [45]:
from __future__ import print_function
align_seq=[]
i=0
for pdb_aa in pdbseq:
    for fasta_aa in seq[i:]:
        if pdb_aa == fasta_aa:
            print(i, fasta_aa)
            align_seq.append(fasta_aa)
            i+=1
            break
        print(i,np.nan)
        align_seq.append(np.nan)
        i+=1
    

0 nan
1 A
2 E
3 Q
4 A
5 T
6 K
7 S
8 V
9 L
10 F
11 V
12 C
13 L
14 G
15 N
16 I
17 C
18 R
19 S
20 P
21 I
22 A
23 E
24 A
25 V
26 F
27 R
28 K
29 L
30 V
31 T
32 D
33 Q
34 N
35 I
36 S
37 E
38 N
39 W
40 R
41 V
42 D
43 S
44 A
45 A
46 T
47 S
48 G
49 Y
50 E
51 I
52 G
53 N
54 P
55 P
56 D
57 Y
58 R
59 G
60 Q
61 S
62 C
63 M
64 K
65 R
66 H
67 G
68 I
69 P
70 M
71 S
72 H
73 V
74 A
75 R
76 Q
77 I
78 T
79 K
80 E
81 D
82 F
83 A
84 T
85 F
86 D
87 Y
88 I
89 L
90 C
91 M
92 D
93 E
94 S
95 N
96 L
97 R
98 D
99 L
100 N
101 R
102 K
103 S
104 N
105 Q
106 V
107 K
108 T
109 C
110 K
111 A
112 K
113 I
114 E
115 L
116 L
117 G
118 S
119 Y
120 D
121 P
122 Q
123 K
124 Q
125 L
126 I
127 I
128 E
129 D
130 P
131 Y
132 Y
133 G
134 N
135 D
136 S
137 D
138 F
139 E
140 T
141 V
142 Y
143 Q
144 Q
145 C
146 V
147 R
148 C
149 C
150 R
151 A
152 F
153 L
154 E
155 K
156 A
157 H


In [49]:
# Matched PDB residue letter per FASTA position, NaN where no PDB residue was
# matched; align_seq must have as many entries as the table has rows.
dfx['pdb_R'] = align_seq

In [97]:
import math

# Spread the PDB temperature factors over the aligned positions: gaps (NaN in
# align_seq) get NaN, matched positions consume the next temperature factor.
i=0  # index of the next unused entry of betafactors
align_bfac=[]
for pos in align_seq:
    # Gaps are stored as float NaN while residues are strings.  Positions left
    # over once the temperature factors run out are padded with NaN instead of
    # being dropped, so align_bfac always has len(align_seq) entries.
    if isinstance(pos, float) or i >= len(betafactors):
        align_bfac.append(np.nan)
    else:
        align_bfac.append(betafactors[i])
        i+=1


In [98]:
# PDB temperature factor per FASTA position (NaN at gaps); align_bfac must
# have as many entries as the table has rows.
dfx['pdb_Bfactor'] = align_bfac

In [100]:
# Save the aligned table to test.csv in the current working directory.
dfx.to_csv('test.csv')