In [1]:
%pylab inline
import dfi
import dfi.fasta_convert
import pandas as pd
import dfi.fastaseq 
from dfi.fastaseq import mapres
from __future__ import print_function

Populating the interactive namespace from numpy and matplotlib


In [2]:
def calc_hessian(x,y,z,Verbose=False):
    """
    Calculate the hessian given the coordinates 
    
    Input
    -----
    (x,y,z) numpy array
       Must all be the same length
    Output
    ------
    kirchhoff: NxN numpy matrix
       
    """
    cutoff = 10
    gamma = 1
    xyz = np.column_stack((x,y,z))
    numres = xyz.shape[0]
    kirchhoff = np.zeros((numres,numres))
    for i in range(numres):
        xyz_i = xyz[i]
        i_p1 = i + 1
        xyz_ij = xyz[i_p1:] - xyz_i
        xyz_ij2=np.multiply(xyz_ij,xyz_ij)
        cutoff2 = cutoff * cutoff
        for j, dist2 in enumerate(xyz_ij2.sum(1)):
            if dist2 > cutoff2:
                continue
            if Verbose:
                print(j, dist2)
            j += i_p1
            kirchhoff[i,j] = -gamma 
            kirchhoff[j,i] = -gamma 
            kirchhoff[i,i] += gamma 
            kirchhoff[j,j] += gamma
    return kirchhoff

In [3]:
# Invert this matrix
def invert_kirchhoff(kirchhoff):
    from scipy import linalg as LA
    U, w, Vt = LA.svd(kirchhoff,full_matrices=False)
    S = LA.diagsvd(w,len(w),len(w))
    np.allclose(kirchhoff,np.dot(U,np.dot(S,Vt)))
    tol = 1e-6
    singular = w < tol
    assert np.sum(singular) == 1.
    invw = 1/w
    invw[singular] = 0
    inv_kirchhoff = np.dot(np.dot(U,np.diag(invw)),Vt)
    return inv_kirchhoff

In [4]:
def _build_kirchhoff(evod_file,n):
    """
    Creates a kirchoff matrix using EVfold contacts
    Input
    -----
    evfold input file: str
       file from evfold
    n: size of the square matrix
    Output
    ------
    kirchoff: NxN numpy matrix
       output matrix
    """

    chain = []
    chain_connection = np.zeros((n,n))
    
    #assign a -1 for residues in contact in the chain
    for i in range(2, n-2):
        chain_connection[i, i+1] = -1
        chain_connection[i, i+2] = -1
        #chain_connection[i, i+3] = -1
        chain_connection[i+1, i] = -1
        chain_connection[i+2, i] = -1
        #chain_connection[i+3, i] = -1
        chain_connection[i, i-1] = -1
        chain_connection[i, i-2] = -1
        #chain_connection[i, i-3] = -1
        chain_connection[i-1, i] = -1
        chain_connection[i-2, i] = -1
        #chain_connection[i-3, i] = -1
        
        chain.append([i, i+1, chain_connection[i, i+1]])
        chain.append([i, i+2, chain_connection[i, i+2]])
        #chain.append([i, i+3, chain_connection[i, i+3]])
        chain.append([i+1, i, chain_connection[i+1, i]])
        chain.append([i+2, i, chain_connection[i+2, i]])
        #chain.append([i+3, i, chain_connection[i+3, i]])
        chain.append([i, i-1, chain_connection[i, i-1]])
        chain.append([i, i-2, chain_connection[i, i-2]])
        #chain.append([i, i-3, chain_connection[i, i-3]])
        chain.append([i-1, i, chain_connection[i-1, i]])
        chain.append([i-2, i, chain_connection[i-2, i]])
        #chain.append([i-3, i, chain_connection[i-3, i]])
        
    #assign a -1 for EC pairs
    evol = []
    contact_pairs = open(evod_file, 'rU').readlines() 
    evol_const = np.zeros((n,n))
    for line in contact_pairs:
        a = line.split()
        i = int(a[0]) - 1 
        j = int(a[2]) - 1 
        if (chain_connection[i, j] != -1):
            evol_const[i, j] = -1.0*float(a[5])
            evol_const[j, i] = -1.0*float(a[5])
            evol.append([i, j, evol_const[i, j]])
            evol.append([j, i, evol_const[j, i]])
    
    #build kirchoff matrix
    kirchhoff = np.zeros((n,n))
    kirchhoff = chain_connection + evol_const
    print('generated kirchhoff using evolutionary constraints')
    print('kirchhoff shape: ', kirchhoff.shape)
    
    #calculate the diagonal
    diag = []
    for i in range(0, n):
        kirchhoff[i, i] = -np.sum(kirchhoff[i])
        diag.append([i, i, kirchhoff[i, i]])
    
    #put everything together for a file
    all = chain + evol + diag
    f = open('evfold_kirchhoff.txt', 'w')
    for x in all:
        f.write('%s \t %s \t %s \n' % (x[0], x[1], x[2]))
    f.close()
    
    return kirchhoff;

In [5]:
#dfi.fetch_pdb('5pnt')
mdlpdbfile='1CB0.pdbmdl'
exptpdbfile='1cb0.pdb'
evoDfile='1CB0_MI_DI.txt'
uniprotID='Q13126'

In [6]:
#PDB MODEL
ATOMS = dfi.pdbio.pdb_reader(mdlpdbfile,CAonly=True)
x,y,z = dfi.getcoords(ATOMS)
numres = len(x)
mdlseq=[mapres[atom.res_name] for atom in ATOMS]
kirchhoff = calc_hessian(x,y,z)
inv_kirchhoff = invert_kirchhoff(kirchhoff)
mdl_diag = np.array([inv_kirchhoff[i,i] for i in range(numres)])

Read 273 atoms from the 1CB0.pdbmdl


In [7]:
#Experimental PDB 
expt_ATOM = dfi.pdbio.pdb_reader(exptpdbfile,CAonly=True)
expt_betafactors = np.array([atom.temp_factor for atom in expt_ATOM])
exptseq=[mapres[atom.res_name] for atom in expt_ATOM]

Read 268 atoms from the 1cb0.pdb


In [8]:
#EVFOLD Contacts 
str_seq=''.join( dfi.fastaseq.get_fastaseq(uniprotID).split('\n')[1:] )
fastaseq=[s for s in str_seq]
numseq = len(fastaseq)
evodkirchhoff=_build_kirchhoff(evoDfile,numseq)
inv_evodkirchhoff=invert_kirchhoff(evodkirchhoff)
evo_diag = np.array([inv_evodkirchhoff[i,i] for i in range(numseq)])

generated kirchhoff using evolutionary constraints
kirchhoff shape:  (283, 283)


# Need to align sequences properly

## Grab the 5pnt fasta sequence 

In [9]:
print(len(fastaseq))
print(len(mdlseq))
print(len(exptseq))

283
273
268


In [10]:
dfx = pd.DataFrame()

In [11]:
dfx['fasta_R'] =fastaseq

In [12]:
dfx['evfold_B'] = evo_diag

# Ned to align to the fasta sequence 

In [13]:
exptseq[:10]

['A', 'V', 'K', 'I', 'G', 'I', 'I', 'G', 'G', 'T']

In [14]:
fastaseq[:10]

['M', 'A', 'S', 'G', 'T', 'T', 'T', 'T', 'A', 'V']

In [15]:
align_seq=[]
i=0
for pdb_aa in mdlseq:
    for fasta_aa in fastaseq[i:]:
        if pdb_aa == fasta_aa:
            #print(i, fasta_aa)
            align_seq.append(fasta_aa)
            i+=1
            break
        #print(i,np.nan)
        align_seq.append(np.nan)
        i+=1
while( len(align_seq) < len(fastaseq) ):
    align_seq.append(np.nan)

In [16]:
len(align_seq)

283

In [17]:
dfx['expt_R'] = align_seq

In [18]:
import math
i=0
align_bfac=[]
for pos in align_seq:
    if i >= len(expt_betafactors):
        break
    #print(pos)
    if type(pos) == float:
        #print(i,np.nan,pos)
        align_bfac.append(np.nan)
    else:
        #print(i,betafactors[i],pos)
        #print(i)
        #print(pos)
        align_bfac.append(expt_betafactors[i])
        i+=1
while( len(align_bfac) < len(fastaseq) ):
    align_bfac.append(np.nan)

In [19]:
len(fastaseq)

283

In [20]:
len(align_bfac)

283

In [21]:
dfx['expt_B'] = align_bfac

In [22]:
len(align_bfac)

283

In [23]:
align_seq=[]
i=0
for pdb_aa in mdlseq:
    for fasta_aa in fastaseq[i:]:
        if pdb_aa == fasta_aa:
            #print(i, fasta_aa)
            align_seq.append(fasta_aa)
            i+=1
            break
        #print(i,np.nan)
        align_seq.append(np.nan)
        i+=1
while( len(align_seq) < len(fastaseq) ):
    align_seq.append(np.nan)

In [24]:
dfx['mdl_R'] = align_seq

In [25]:
import math
i=0
align_bfac=[]
for pos in align_seq:
    if i >= len(mdl_diag):
        break
    #print(pos)
    if type(pos) == float:
        #print(i,np.nan,pos)
        align_bfac.append(np.nan)
    else:
        #print(i,betafactors[i],pos)
        #print(i)
        #print(pos)
        align_bfac.append(mdl_diag[i])
        i+=1
while( len(align_bfac) < len(fastaseq) ):
    align_bfac.append(np.nan)

In [26]:
dfx['mdl_B'] = align_bfac

In [27]:
dfx

Unnamed: 0,fasta_R,evfold_B,expt_R,expt_B,mdl_R,mdl_B
0,M,2.950474,,,,
1,A,2.232908,A,25.16,A,0.099684
2,S,1.957541,,,,
3,G,1.751409,,,,
4,T,1.554916,,,,
5,T,1.362534,,,,
6,T,1.169132,,,,
7,T,0.983747,,,,
8,A,0.782516,,,,
9,V,0.626694,V,19.73,V,0.064156


In [31]:
dfx.to_csv('DI-bfactor.csv',index=False)

In [38]:
dfx[[i for i in dfx.columns if '_B' in i]].dropna().corr()

Unnamed: 0,evfold_B,expt_B,mdl_B
evfold_B,1.0,0.376102,0.255568
expt_B,0.376102,1.0,0.390818
mdl_B,0.255568,0.390818,1.0
