In [1]:
%pylab inline
import dfi

Populating the interactive namespace from numpy and matplotlib


In [2]:
dfi.fetch_pdb('5pnt')
ATOMS = dfi.pdbio.pdb_reader('5pnt.pdb',CAonly=True)
x,y,z = dfi.getcoords(ATOMS)
numres = len(x)

Read 157 atoms from the 5pnt.pdb


In [3]:

def calc_hessian(x,y,z,Verbose=False):
    """
    Calculate the hessian given the coordinates 
    
    Input
    -----
    (x,y,z) numpy array
       Must all be the same length
    Output
    ------
    kirchhoff: NxN numpy matrix
       
    """
    cutoff = 10
    gamma = 1
    xyz = np.column_stack((x,y,z))
    numres = xyz.shape[0]
    kirchhoff = np.zeros((numres,numres))
    for i in range(numres):
        xyz_i = xyz[i]
        i_p1 = i + 1
        xyz_ij = xyz[i_p1:] - xyz_i
        xyz_ij2=np.multiply(xyz_ij,xyz_ij)
        cutoff2 = cutoff * cutoff
        for j, dist2 in enumerate(xyz_ij2.sum(1)):
            if dist2 > cutoff2:
                continue
            if Verbose:
                print(j, dist2)
            j += i_p1
            kirchhoff[i,j] = -gamma 
            kirchhoff[j,i] = -gamma 
            kirchhoff[i,i] += gamma 
            kirchhoff[j,j] += gamma
    return kirchhoff

In [4]:
kirchhoff = calc_hessian(x,y,z)

# Invert this matrix

In [5]:
def invert_kirchhoff(kirchhoff):
    from scipy import linalg as LA
    U, w, Vt = LA.svd(kirchhoff,full_matrices=False)
    S = LA.diagsvd(w,len(w),len(w))
    np.allclose(kirchhoff,np.dot(U,np.dot(S,Vt)))
    tol = 1e-6
    singular = w < tol
    assert np.sum(singular) == 1.
    invw = 1/w
    invw[singular] = 0
    inv_kirchhoff = np.dot(np.dot(U,np.diag(invw)),Vt)
    return inv_kirchhoff

In [6]:
inv_kirchhoff = invert_kirchhoff(kirchhoff)

In [7]:
diag = np.array([inv_kirchhoff[i,i] for i in range(numres)])

In [8]:
betafactors = np.array([atom.temp_factor for atom in ATOMS])

In [9]:
np.corrcoef(diag,betafactors).round(2)[0,1]

0.62

In [10]:
def _build_kirchhoff(evod_file,n):
    """
    Creates a kirchoff matrix using EVfold contacts
    Input
    -----
    evfold input file: str
       file from evfold
    n: size of the square matrix
    Output
    ------
    kirchoff: NxN numpy matrix
       output matrix
    """

    chain = []
    chain_connection = np.zeros((n,n))
    
    #assign a -1 for residues in contact in the chain
    for i in range(2, n-2):
        chain_connection[i, i+1] = -1
        chain_connection[i, i+2] = -1
        #chain_connection[i, i+3] = -1
        chain_connection[i+1, i] = -1
        chain_connection[i+2, i] = -1
        #chain_connection[i+3, i] = -1
        chain_connection[i, i-1] = -1
        chain_connection[i, i-2] = -1
        #chain_connection[i, i-3] = -1
        chain_connection[i-1, i] = -1
        chain_connection[i-2, i] = -1
        #chain_connection[i-3, i] = -1
        
        chain.append([i, i+1, chain_connection[i, i+1]])
        chain.append([i, i+2, chain_connection[i, i+2]])
        #chain.append([i, i+3, chain_connection[i, i+3]])
        chain.append([i+1, i, chain_connection[i+1, i]])
        chain.append([i+2, i, chain_connection[i+2, i]])
        #chain.append([i+3, i, chain_connection[i+3, i]])
        chain.append([i, i-1, chain_connection[i, i-1]])
        chain.append([i, i-2, chain_connection[i, i-2]])
        #chain.append([i, i-3, chain_connection[i, i-3]])
        chain.append([i-1, i, chain_connection[i-1, i]])
        chain.append([i-2, i, chain_connection[i-2, i]])
        #chain.append([i-3, i, chain_connection[i-3, i]])
        
    #assign a -1 for EC pairs
    evol = []
    contact_pairs = open(evod_file, 'rU').readlines() 
    evol_const = np.zeros((n,n))
    for line in contact_pairs:
        a = line.split()
        i = int(a[0]) - 1 
        j = int(a[2]) - 1 
        if (chain_connection[i, j] != -1):
            evol_const[i, j] = -1.0*float(a[5])
            evol_const[j, i] = -1.0*float(a[5])
            evol.append([i, j, evol_const[i, j]])
            evol.append([j, i, evol_const[j, i]])
    
    #build kirchoff matrix
    kirchhoff = np.zeros((n,n))
    kirchhoff = chain_connection + evol_const
    print 'generated kirchhoff using evolutionary constraints'
    print 'kirchhoff shape: ', kirchhoff.shape
    
    #calculate the diagonal
    diag = []
    for i in range(0, n):
        kirchhoff[i, i] = -np.sum(kirchhoff[i])
        diag.append([i, i, kirchhoff[i, i]])
    
    #put everything together for a file
    all = chain + evol + diag
    f = open('evfold_kirchhoff.txt', 'w')
    for x in all:
        f.write('%s \t %s \t %s \n' % (x[0], x[1], x[2]))
    f.close()
    
    return kirchhoff;

In [11]:
evoDfile='./data/5pnt_MI_DI.txt'

In [12]:
evodkirchhoff=_build_kirchhoff(evoDfile,158)

generated kirchhoff using evolutionary constraints
kirchhoff shape:  (158, 158)


In [14]:
numseq=evodkirchhoff.shape[0]

In [16]:
inv_evodkirchhoff=invert_kirchhoff(evodkirchhoff)

In [17]:
evo_diag = np.array([inv_evodkirchhoff[i,i] for i in range(numseq)])

In [18]:
np.corrcoef(diag,evo_diag[1:]).round(2)[0,1]

0.66000000000000003

In [19]:
np.corrcoef(betafactors,evo_diag[1:]).round(2)[0,1]

0.72999999999999998

In [20]:
np.corrcoef(diag,betafactors).round(2)[0,1]

0.62

# Need to align sequences properly

## Grab the 5pnt fasta sequence 

In [21]:
import dfi.fastaseq 

In [61]:
str_seq=''.join( dfi.fastaseq.get_fastaseq('P24666').split('\n')[1:] )

In [62]:
seq=[s for s in str_seq]

In [63]:
import pandas as pd

In [64]:
dfx = pd.DataFrame()

In [65]:
dfx['fasta_R'] =seq

In [69]:
evodkirchhoff.shape

(158, 158)

In [70]:
dfx['b-evfold'] = evo_diag

In [72]:
dfx.head()

Unnamed: 0,fasta_R,b-evfold
0,M,2.367071
1,A,1.653347
2,E,1.379729
3,Q,1.175689
4,A,0.987923


In [74]:
diag.shape

(157,)

# Ned to align to the fasta sequence 

In [75]:
import dfi.fasta_convert

In [80]:
pdbseq=dfi.fasta_convert.get_seq('5pnt.pdb')

Read 157 atoms from the 5pnt.pdb


In [82]:
pdbseq[:10]

['A', 'E', 'Q', 'A', 'T', 'K', 'S', 'V', 'L', 'F']

In [83]:
seq[:10]

['M', 'A', 'E', 'Q', 'A', 'T', 'K', 'S', 'V', 'L']

In [95]:
from __future__ import print_function
i=0
for pdb_aa in enumerate(pdbseq):
    print(pdb_aa)
    for fasta_aa in seq[i:]:
        if pdb_aa == fasta_aa:
            print(i, fasta_aa)
            break
        i+=1

(0, 'A')
(1, 'E')
(2, 'Q')
(3, 'A')
(4, 'T')
(5, 'K')
(6, 'S')
(7, 'V')
(8, 'L')
(9, 'F')
(10, 'V')
(11, 'C')
(12, 'L')
(13, 'G')
(14, 'N')
(15, 'I')
(16, 'C')
(17, 'R')
(18, 'S')
(19, 'P')
(20, 'I')
(21, 'A')
(22, 'E')
(23, 'A')
(24, 'V')
(25, 'F')
(26, 'R')
(27, 'K')
(28, 'L')
(29, 'V')
(30, 'T')
(31, 'D')
(32, 'Q')
(33, 'N')
(34, 'I')
(35, 'S')
(36, 'E')
(37, 'N')
(38, 'W')
(39, 'R')
(40, 'V')
(41, 'D')
(42, 'S')
(43, 'A')
(44, 'A')
(45, 'T')
(46, 'S')
(47, 'G')
(48, 'Y')
(49, 'E')
(50, 'I')
(51, 'G')
(52, 'N')
(53, 'P')
(54, 'P')
(55, 'D')
(56, 'Y')
(57, 'R')
(58, 'G')
(59, 'Q')
(60, 'S')
(61, 'C')
(62, 'M')
(63, 'K')
(64, 'R')
(65, 'H')
(66, 'G')
(67, 'I')
(68, 'P')
(69, 'M')
(70, 'S')
(71, 'H')
(72, 'V')
(73, 'A')
(74, 'R')
(75, 'Q')
(76, 'I')
(77, 'T')
(78, 'K')
(79, 'E')
(80, 'D')
(81, 'F')
(82, 'A')
(83, 'T')
(84, 'F')
(85, 'D')
(86, 'Y')
(87, 'I')
(88, 'L')
(89, 'C')
(90, 'M')
(91, 'D')
(92, 'E')
(93, 'S')
(94, 'N')
(95, 'L')
(96, 'R')
(97, 'D')
(98, 'L')
(99, 'N')
(100, 'R')